docparser 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rubocop.yml +5 -0
- data/.travis.yml +3 -0
- data/Gemfile +9 -1
- data/README.md +11 -4
- data/Rakefile +15 -0
- data/example.rb +9 -7
- data/lib/docparser.rb +1 -0
- data/lib/docparser/document.rb +18 -11
- data/lib/docparser/output.rb +8 -8
- data/lib/docparser/output/html_output.rb +53 -47
- data/lib/docparser/output/json_output.rb +8 -3
- data/lib/docparser/output/multi_output.rb +4 -8
- data/lib/docparser/output/nil_output.rb +21 -0
- data/lib/docparser/output/screen_output.rb +2 -1
- data/lib/docparser/output/xlsx_output.rb +12 -2
- data/lib/docparser/output/yaml_output.rb +6 -1
- data/lib/docparser/parser.rb +80 -49
- data/lib/docparser/version.rb +1 -1
- data/test/lib/docparser/blackbox_test.rb +29 -0
- data/test/lib/docparser/document_test.rb +134 -0
- data/test/lib/docparser/logging_test.rb +19 -0
- data/test/lib/docparser/output/csv_output_test.rb +51 -0
- data/test/lib/docparser/output/html_output_test.rb +57 -0
- data/test/lib/docparser/output/json_output_test.rb +65 -0
- data/test/lib/docparser/output/multi_output_test.rb +80 -0
- data/test/lib/docparser/output/nil_output_test.rb +27 -0
- data/test/lib/docparser/output/screen_output_test.rb +55 -0
- data/test/lib/docparser/output/xlsx_output_test.rb +53 -0
- data/test/lib/docparser/output/yaml_output_test.rb +76 -0
- data/test/lib/docparser/output_test.rb +85 -0
- data/test/lib/docparser/parser_test.rb +197 -0
- data/test/lib/docparser/version_test.rb +11 -0
- data/test/support/hackaday/dl.rb +4 -0
- data/test/support/hackaday/file_1.html +716 -0
- data/test/support/hackaday/file_10.html +791 -0
- data/test/support/hackaday/file_11.html +787 -0
- data/test/support/hackaday/file_12.html +715 -0
- data/test/support/hackaday/file_13.html +793 -0
- data/test/support/hackaday/file_14.html +718 -0
- data/test/support/hackaday/file_15.html +707 -0
- data/test/support/hackaday/file_16.html +713 -0
- data/test/support/hackaday/file_17.html +715 -0
- data/test/support/hackaday/file_18.html +725 -0
- data/test/support/hackaday/file_19.html +715 -0
- data/test/support/hackaday/file_2.html +793 -0
- data/test/support/hackaday/file_20.html +795 -0
- data/test/support/hackaday/file_21.html +804 -0
- data/test/support/hackaday/file_22.html +722 -0
- data/test/support/hackaday/file_23.html +793 -0
- data/test/support/hackaday/file_24.html +717 -0
- data/test/support/hackaday/file_25.html +715 -0
- data/test/support/hackaday/file_26.html +717 -0
- data/test/support/hackaday/file_27.html +723 -0
- data/test/support/hackaday/file_28.html +711 -0
- data/test/support/hackaday/file_29.html +711 -0
- data/test/support/hackaday/file_3.html +794 -0
- data/test/support/hackaday/file_30.html +715 -0
- data/test/support/hackaday/file_31.html +713 -0
- data/test/support/hackaday/file_32.html +714 -0
- data/test/support/hackaday/file_33.html +716 -0
- data/test/support/hackaday/file_34.html +714 -0
- data/test/support/hackaday/file_35.html +792 -0
- data/test/support/hackaday/file_36.html +719 -0
- data/test/support/hackaday/file_37.html +712 -0
- data/test/support/hackaday/file_38.html +709 -0
- data/test/support/hackaday/file_39.html +808 -0
- data/test/support/hackaday/file_4.html +814 -0
- data/test/support/hackaday/file_40.html +801 -0
- data/test/support/hackaday/file_5.html +715 -0
- data/test/support/hackaday/file_6.html +792 -0
- data/test/support/hackaday/file_7.html +714 -0
- data/test/support/hackaday/file_8.html +717 -0
- data/test/support/hackaday/file_9.html +719 -0
- data/test/support/test_encoding.html +12 -0
- data/test/support/test_encoding2.html +12 -0
- data/test/support/test_html.html +16 -0
- data/test/support/test_xml.xml +5 -0
- data/test/test_helper.rb +14 -0
- metadata +126 -3
data/lib/docparser/output/nil_output.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
module DocParser
|
2
|
+
# This Output is used for testing purposes.
|
3
|
+
|
4
|
+
# @see Output
|
5
|
+
class NilOutput < Output
|
6
|
+
# @!visibility private
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@rowcount = 0
|
10
|
+
end
|
11
|
+
|
12
|
+
def close
|
13
|
+
end
|
14
|
+
|
15
|
+
def write_row(row)
|
16
|
+
end
|
17
|
+
|
18
|
+
def add_row(row)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/docparser/output/screen_output.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'terminal-table'
|
2
2
|
require 'pageme'
|
3
3
|
module DocParser
|
4
|
-
# This Output
|
4
|
+
# This Output can be used for debugging purposes.
|
5
5
|
|
6
6
|
# It pipes all rows through a pager
|
7
7
|
# @see Output
|
@@ -25,6 +25,7 @@ module DocParser
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def write_row(row)
|
28
|
+
raise MissingHeaderException if @header.nil? || @header.length == 0
|
28
29
|
out = []
|
29
30
|
0.upto(@header.length - 1) do |counter|
|
30
31
|
out << [@header[counter], row[counter]]
|
data/lib/docparser/output/xlsx_output.rb
CHANGED
@@ -20,9 +20,19 @@ module DocParser
|
|
20
20
|
@sheet.add_row row
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
24
|
-
|
23
|
+
def footer
|
24
|
+
unless @header.nil?
|
25
|
+
@sheet.add_table "A1:#{@sheet.cells.last.r}", name: 'Data'
|
26
|
+
end
|
25
27
|
@package.serialize @filename
|
26
28
|
end
|
29
|
+
|
30
|
+
def rowcount
|
31
|
+
if @header.nil?
|
32
|
+
@sheet.rows.length
|
33
|
+
else
|
34
|
+
@sheet.rows.length - 1
|
35
|
+
end
|
36
|
+
end
|
27
37
|
end
|
28
38
|
end
|
data/lib/docparser/output/yaml_output.rb
CHANGED
@@ -6,9 +6,14 @@ module DocParser
|
|
6
6
|
class YAMLOutput < Output
|
7
7
|
# @!visibility private
|
8
8
|
def write_row(row)
|
9
|
+
raise MissingHeaderException if @header.nil? || @header.length == 0
|
9
10
|
@doc ||= {}
|
10
11
|
0.upto(@header.length - 1) do |counter|
|
11
|
-
|
12
|
+
if row.length > counter
|
13
|
+
@doc[@header[counter]] = row[counter]
|
14
|
+
else
|
15
|
+
@doc[@header[counter]] = ''
|
16
|
+
end
|
12
17
|
end
|
13
18
|
YAML.dump @doc, @file
|
14
19
|
end
|
data/lib/docparser/parser.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
$LOAD_PATH.unshift __dir__
|
2
2
|
require 'rubygems'
|
3
3
|
require 'bundler/setup'
|
4
4
|
require 'version'
|
@@ -8,6 +8,8 @@ require 'nokogiri'
|
|
8
8
|
require 'open-uri'
|
9
9
|
require 'parallel'
|
10
10
|
require 'set'
|
11
|
+
require 'log4r'
|
12
|
+
require 'log4r/formatter/patternformatter'
|
11
13
|
require 'output/screen_output.rb'
|
12
14
|
require 'output/csv_output.rb'
|
13
15
|
require 'output/html_output.rb'
|
@@ -15,14 +17,26 @@ require 'output/xlsx_output.rb'
|
|
15
17
|
require 'output/yaml_output.rb'
|
16
18
|
require 'output/json_output.rb'
|
17
19
|
require 'output/multi_output.rb'
|
18
|
-
|
20
|
+
require 'output/nil_output.rb'
|
21
|
+
|
22
|
+
Log4r.define_levels(*Log4r::Log4rConfig::LogLevels)
|
23
|
+
logger = Log4r::Logger.new('docparser')
|
24
|
+
output = Log4r::StdoutOutputter.new('docparser')
|
25
|
+
output.formatter = Log4r::PatternFormatter.new(pattern: '[%l %C] %d :: %m')
|
26
|
+
logger.outputters = output
|
27
|
+
logger.level = Log4r::INFO
|
28
|
+
logger = nil
|
29
|
+
output = nil
|
30
|
+
|
31
|
+
# The DocParser namespace
|
32
|
+
# See README.md for information on using DocParser
|
19
33
|
module DocParser
|
20
34
|
# The main parser class. This is the class you'll use to create your parser
|
21
35
|
# The real work happens in the Document class
|
22
36
|
# @see Document
|
23
37
|
class Parser
|
24
38
|
# @!visibility private
|
25
|
-
attr_reader :outputs
|
39
|
+
attr_reader :outputs, :files, :num_processes, :encoding
|
26
40
|
|
27
41
|
# Creates a new parser instance
|
28
42
|
# @param files [Array] An array containing URLs or paths to files
|
@@ -33,73 +47,90 @@ module DocParser
|
|
33
47
|
# @param range [Range] Range of files to process (nil means process all)
|
34
48
|
# @param num_processes [Fixnum] Number of parallel processes
|
35
49
|
def initialize(files: [], quiet: false, encoding: 'utf-8', parallel: true,
|
36
|
-
output:
|
50
|
+
output: nil, range: nil,
|
37
51
|
num_processes: Parallel.processor_count + 1)
|
38
|
-
@
|
39
|
-
@
|
40
|
-
@num_processes = num_processes
|
52
|
+
@num_processes = parallel ? num_processes : 1
|
53
|
+
@files = range ? files[range] : files
|
41
54
|
@encoding = encoding
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
55
|
+
|
56
|
+
Log4r::Logger['docparser'].level = quiet ? Log4r::ERROR : Log4r::INFO
|
57
|
+
|
58
|
+
unless output.nil?
|
59
|
+
if output.is_a? Output
|
60
|
+
@outputs = []
|
61
|
+
@outputs << output
|
62
|
+
elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
|
63
|
+
@outputs = output
|
64
|
+
else
|
65
|
+
raise ArgumentError, 'Invalid outputs specified'
|
66
|
+
end
|
67
|
+
|
68
|
+
@resultsets = Array.new(@outputs.length) { Set.new }
|
54
69
|
end
|
55
|
-
|
56
|
-
|
70
|
+
|
71
|
+
@logger = Log4r::Logger.new('docparser::parser')
|
72
|
+
@logger.info "DocParser v#{VERSION}"
|
73
|
+
@logger.info "#{@files.length} files loaded (encoding: #{@encoding})"
|
57
74
|
end
|
58
75
|
|
59
76
|
#
|
60
77
|
# Parses the `files`
|
61
78
|
#
|
62
79
|
def parse!(&block)
|
63
|
-
|
80
|
+
@logger.info "Parsing #{@files.length} files."
|
64
81
|
start_time = Time.now
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
log "Starting #{@num_processes} processes"
|
69
|
-
Parallel.map(@files, in_processes: @num_processes) do |file|
|
70
|
-
Document.new(file, encoding: @encoding, parser: self).parse!(&block)
|
71
|
-
end.each do |result|
|
72
|
-
result.each_with_index { |set, index| resultsets[index].merge(set) }
|
73
|
-
end
|
74
|
-
log 'Parallel processing finished, writing results..'
|
82
|
+
|
83
|
+
if @num_processes > 1
|
84
|
+
parallel_process(&block)
|
75
85
|
else
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
86
|
+
serial_process(&block)
|
87
|
+
end
|
88
|
+
|
89
|
+
@logger.info 'Processing finished'
|
90
|
+
|
91
|
+
write_to_outputs if @outputs
|
92
|
+
|
93
|
+
@logger.info sprintf('Done processing in %.2fs.', Time.now - start_time)
|
94
|
+
end
|
95
|
+
|
96
|
+
private
|
97
|
+
|
98
|
+
def parallel_process(&block)
|
99
|
+
@logger.info "Starting #{@num_processes} processes"
|
100
|
+
Parallel.map(@files, in_processes: @num_processes) do |file|
|
101
|
+
# :nocov: #
|
102
|
+
parse_doc(file, &block)
|
103
|
+
# :nocov: #
|
104
|
+
end.each do |result|
|
105
|
+
result.each_with_index do |set, index|
|
106
|
+
@resultsets[index].merge(set)
|
107
|
+
end if @outputs
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
def serial_process(&block)
|
112
|
+
@files.each do |file|
|
113
|
+
parse_doc(file, &block).each_with_index do |set, index|
|
114
|
+
@resultsets[index].merge(set) if @outputs
|
81
115
|
end
|
82
116
|
end
|
117
|
+
end
|
83
118
|
|
84
|
-
|
119
|
+
def parse_doc(file, &block)
|
120
|
+
doc = Document.new(filename: file, encoding: @encoding, parser: self)
|
121
|
+
doc.parse!(&block)
|
122
|
+
end
|
85
123
|
|
124
|
+
def write_to_outputs
|
125
|
+
@logger.info 'Writing data..'
|
86
126
|
@outputs.each_with_index do |output, index|
|
87
|
-
resultsets[index].each do |row|
|
127
|
+
@resultsets[index].each do |row|
|
88
128
|
output.add_row row
|
89
129
|
end
|
90
|
-
resultsets[index] = nil
|
130
|
+
@resultsets[index] = nil
|
91
131
|
output.close
|
92
|
-
log output.summary
|
93
132
|
end
|
94
|
-
|
95
|
-
log ''
|
96
|
-
log 'Done processing in %.2fs.' % (Time.now - start_time)
|
97
133
|
end
|
98
134
|
|
99
|
-
private
|
100
|
-
|
101
|
-
def log(str)
|
102
|
-
puts str unless @quiet
|
103
|
-
end
|
104
135
|
end
|
105
136
|
end
|
data/lib/docparser/version.rb
CHANGED
data/test/lib/docparser/blackbox_test.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require_relative '../../test_helper'
|
2
|
+
require 'open3'
|
3
|
+
require 'shellwords'
|
4
|
+
|
5
|
+
def cmd_to_sys(command)
|
6
|
+
Open3.popen3(command) do |stdin, stdout, stderr|
|
7
|
+
[stdout.read, stderr.read]
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
describe DocParser do
|
12
|
+
it 'should run the example without problems' do
|
13
|
+
curwd = Dir.getwd
|
14
|
+
Dir.mktmpdir do |dir|
|
15
|
+
Dir.chdir(dir)
|
16
|
+
example_file = Shellwords.escape(File.join($ROOT_DIR, 'example.rb'))
|
17
|
+
out, err = cmd_to_sys '/usr/bin/env ruby ' + example_file
|
18
|
+
err.must_be_empty
|
19
|
+
rows = out.scan(/(\d+) rows/).flatten
|
20
|
+
rows.length.must_equal 5
|
21
|
+
row_lengths = rows.group_by { |elem| elem.to_i }
|
22
|
+
row_lengths.length.must_equal 1
|
23
|
+
# HaD: 40 pages of 7 articles
|
24
|
+
row_lengths.keys.first.must_equal(7 * 40)
|
25
|
+
out.must_match(/Done processing/)
|
26
|
+
end
|
27
|
+
Dir.chdir(curwd)
|
28
|
+
end
|
29
|
+
end
|
data/test/lib/docparser/document_test.rb
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
require_relative '../../test_helper'
|
2
|
+
describe DocParser::Document do
|
3
|
+
before do
|
4
|
+
Log4r::Logger['docparser'].level = Log4r::INFO
|
5
|
+
$output = DocParser::NilOutput.new
|
6
|
+
@parser = Class.new do
|
7
|
+
define_method(:outputs) { [$output] }
|
8
|
+
end.new
|
9
|
+
@test_doc_path = File.join($SUPPORT_DIR, 'test_html.html')
|
10
|
+
@test_doc = DocParser::Document.new(filename: @test_doc_path,
|
11
|
+
parser: @parser)
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should read HTML contents' do
|
15
|
+
file = File.join($SUPPORT_DIR, 'test_html.html')
|
16
|
+
doc = DocParser::Document.new(filename: file, parser: @parser)
|
17
|
+
doc.doc.must_be_instance_of Nokogiri::HTML::Document
|
18
|
+
doc.html.must_equal(open(file).read)
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'should read XML contents' do
|
22
|
+
file = File.join($SUPPORT_DIR, 'test_xml.xml')
|
23
|
+
doc = DocParser::Document.new(filename: file, parser: @parser)
|
24
|
+
doc.doc.must_be_instance_of Nokogiri::XML::Document
|
25
|
+
doc.html.must_equal(open(file).read)
|
26
|
+
doc.xpath_content('xmltest > title').must_equal('Test XML')
|
27
|
+
doc.xpath_content('xmltest > test').must_equal('Character Data')
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should read remote contents' do
|
31
|
+
url = 'https://gist.github.com/jurriaan/3f2750aa546e3e6719cf/raw'
|
32
|
+
doc = DocParser::Document.new(filename: url, parser: @parser)
|
33
|
+
doc.html.must_equal(open(url).read)
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'should use the correct encoding' do
|
37
|
+
file = File.join($SUPPORT_DIR, 'test_encoding.html')
|
38
|
+
file2 = File.join($SUPPORT_DIR, 'test_encoding2.html')
|
39
|
+
doc = DocParser::Document.new(filename: file, parser: @parser)
|
40
|
+
doc2 = DocParser::Document.new(filename: file2,
|
41
|
+
parser: @parser,
|
42
|
+
encoding: 'iso-8859-1')
|
43
|
+
doc.html.must_equal(doc2.html)
|
44
|
+
doc.css_content('#encoding').must_equal(doc2.css_content('#encoding'))
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'should specify filename and encoding in #inspect' do
|
48
|
+
@test_doc.inspect.must_include(@test_doc.filename)
|
49
|
+
@test_doc.inspect.must_include(@test_doc.encoding)
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'should get the title of a document' do
|
53
|
+
@test_doc.title.must_equal('Test HTML')
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'should store the path to the document' do
|
57
|
+
@test_doc.filename.must_equal(@test_doc_path)
|
58
|
+
end
|
59
|
+
|
60
|
+
it 'should be possible to use css queries' do
|
61
|
+
css = 'article > h1 + p'
|
62
|
+
css_content = @test_doc.css_content(css)
|
63
|
+
css_element = @test_doc.css(css)
|
64
|
+
css_content.must_equal('Great article it is')
|
65
|
+
css_content.must_equal(css_element.first.content)
|
66
|
+
end
|
67
|
+
|
68
|
+
it 'should be possible to use xpath queries' do
|
69
|
+
xpath = '//li/ancestor::article/h1'
|
70
|
+
xpath_content = @test_doc.xpath_content(xpath)
|
71
|
+
xpath_element = @test_doc.xpath(xpath)
|
72
|
+
xpath_content.must_equal('This is an article')
|
73
|
+
xpath_content.must_equal(xpath_element.first.content)
|
74
|
+
end
|
75
|
+
|
76
|
+
it 'should be possible to use regular expressions' do
|
77
|
+
regex = @test_doc.regexp(/\<h1\>([^\<])*/)
|
78
|
+
regex.must_equal(@test_doc.html.match(/\<h1\>([^\<])*/))
|
79
|
+
end
|
80
|
+
|
81
|
+
it 'should be possible to use blocks on query methods' do
|
82
|
+
array = []
|
83
|
+
@test_doc.css('p') do |element|
|
84
|
+
array << element.content
|
85
|
+
end
|
86
|
+
array.last.must_equal('This is the last paragraph')
|
87
|
+
array2 = []
|
88
|
+
@test_doc.xpath('//p') do |element|
|
89
|
+
array2 << element.content
|
90
|
+
end
|
91
|
+
array2.must_equal(array)
|
92
|
+
end
|
93
|
+
|
94
|
+
it 'should warn when providing an empty file' do
|
95
|
+
file = Tempfile.new('empty')
|
96
|
+
file.write('')
|
97
|
+
file.close
|
98
|
+
|
99
|
+
open(file.path).read.empty?.must_equal true
|
100
|
+
_, err = capture_io do
|
101
|
+
# Switch to hijacked IO
|
102
|
+
Log4r::Outputter['docparser'].instance_variable_set(:@out, $stderr)
|
103
|
+
DocParser::Document.new(filename: file.path, parser: @parser)
|
104
|
+
end
|
105
|
+
# Restore IO
|
106
|
+
Log4r::Outputter['docparser'].instance_variable_set(:@out, $stderr)
|
107
|
+
err.must_include "#{file.path} is empty"
|
108
|
+
end
|
109
|
+
|
110
|
+
it 'should add the row to the results' do
|
111
|
+
@test_doc.add_row ['test']
|
112
|
+
@test_doc.add_row 'test', 'test2'
|
113
|
+
@test_doc.results.must_equal [[['test'], ['test', 'test2']]]
|
114
|
+
end
|
115
|
+
|
116
|
+
it 'should be possible to specify outputs directly' do
|
117
|
+
@test_doc.add_row ['test!'], output: $output
|
118
|
+
@test_doc.results.must_equal [[['test!']]]
|
119
|
+
end
|
120
|
+
|
121
|
+
it 'should be possible to use multiple outputs' do
|
122
|
+
output = DocParser::NilOutput.new
|
123
|
+
output2 = DocParser::NilOutput.new
|
124
|
+
parser = Class.new do
|
125
|
+
define_method(:outputs) { [output, output2] }
|
126
|
+
end.new
|
127
|
+
test_doc = DocParser::Document.new(filename: @test_doc_path,
|
128
|
+
parser: parser)
|
129
|
+
test_doc.add_row ['a'], output: 1
|
130
|
+
test_doc.add_row ['b'], output: 0
|
131
|
+
test_doc.results.must_equal [[['b']], [['a']]]
|
132
|
+
end
|
133
|
+
|
134
|
+
end
|
data/test/lib/docparser/logging_test.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require_relative '../../test_helper'
|
2
|
+
|
3
|
+
describe DocParser do
|
4
|
+
it 'should have a valid logger' do
|
5
|
+
logger = Log4r::Logger['docparser']
|
6
|
+
logger.wont_be_nil
|
7
|
+
logger.must_be_instance_of Log4r::Logger
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'must have the correct loglevel by default' do
|
11
|
+
Log4r::Logger['docparser'].level.must_equal Log4r::INFO
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should log to the correct output' do
|
15
|
+
outputters = Log4r::Logger['docparser'].outputters
|
16
|
+
outputters.length.must_equal 1
|
17
|
+
outputters.first.must_be_instance_of Log4r::StdoutOutputter
|
18
|
+
end
|
19
|
+
end
|