docparser 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rubocop.yml +5 -0
- data/.travis.yml +3 -0
- data/Gemfile +9 -1
- data/README.md +11 -4
- data/Rakefile +15 -0
- data/example.rb +9 -7
- data/lib/docparser.rb +1 -0
- data/lib/docparser/document.rb +18 -11
- data/lib/docparser/output.rb +8 -8
- data/lib/docparser/output/html_output.rb +53 -47
- data/lib/docparser/output/json_output.rb +8 -3
- data/lib/docparser/output/multi_output.rb +4 -8
- data/lib/docparser/output/nil_output.rb +21 -0
- data/lib/docparser/output/screen_output.rb +2 -1
- data/lib/docparser/output/xlsx_output.rb +12 -2
- data/lib/docparser/output/yaml_output.rb +6 -1
- data/lib/docparser/parser.rb +80 -49
- data/lib/docparser/version.rb +1 -1
- data/test/lib/docparser/blackbox_test.rb +29 -0
- data/test/lib/docparser/document_test.rb +134 -0
- data/test/lib/docparser/logging_test.rb +19 -0
- data/test/lib/docparser/output/csv_output_test.rb +51 -0
- data/test/lib/docparser/output/html_output_test.rb +57 -0
- data/test/lib/docparser/output/json_output_test.rb +65 -0
- data/test/lib/docparser/output/multi_output_test.rb +80 -0
- data/test/lib/docparser/output/nil_output_test.rb +27 -0
- data/test/lib/docparser/output/screen_output_test.rb +55 -0
- data/test/lib/docparser/output/xlsx_output_test.rb +53 -0
- data/test/lib/docparser/output/yaml_output_test.rb +76 -0
- data/test/lib/docparser/output_test.rb +85 -0
- data/test/lib/docparser/parser_test.rb +197 -0
- data/test/lib/docparser/version_test.rb +11 -0
- data/test/support/hackaday/dl.rb +4 -0
- data/test/support/hackaday/file_1.html +716 -0
- data/test/support/hackaday/file_10.html +791 -0
- data/test/support/hackaday/file_11.html +787 -0
- data/test/support/hackaday/file_12.html +715 -0
- data/test/support/hackaday/file_13.html +793 -0
- data/test/support/hackaday/file_14.html +718 -0
- data/test/support/hackaday/file_15.html +707 -0
- data/test/support/hackaday/file_16.html +713 -0
- data/test/support/hackaday/file_17.html +715 -0
- data/test/support/hackaday/file_18.html +725 -0
- data/test/support/hackaday/file_19.html +715 -0
- data/test/support/hackaday/file_2.html +793 -0
- data/test/support/hackaday/file_20.html +795 -0
- data/test/support/hackaday/file_21.html +804 -0
- data/test/support/hackaday/file_22.html +722 -0
- data/test/support/hackaday/file_23.html +793 -0
- data/test/support/hackaday/file_24.html +717 -0
- data/test/support/hackaday/file_25.html +715 -0
- data/test/support/hackaday/file_26.html +717 -0
- data/test/support/hackaday/file_27.html +723 -0
- data/test/support/hackaday/file_28.html +711 -0
- data/test/support/hackaday/file_29.html +711 -0
- data/test/support/hackaday/file_3.html +794 -0
- data/test/support/hackaday/file_30.html +715 -0
- data/test/support/hackaday/file_31.html +713 -0
- data/test/support/hackaday/file_32.html +714 -0
- data/test/support/hackaday/file_33.html +716 -0
- data/test/support/hackaday/file_34.html +714 -0
- data/test/support/hackaday/file_35.html +792 -0
- data/test/support/hackaday/file_36.html +719 -0
- data/test/support/hackaday/file_37.html +712 -0
- data/test/support/hackaday/file_38.html +709 -0
- data/test/support/hackaday/file_39.html +808 -0
- data/test/support/hackaday/file_4.html +814 -0
- data/test/support/hackaday/file_40.html +801 -0
- data/test/support/hackaday/file_5.html +715 -0
- data/test/support/hackaday/file_6.html +792 -0
- data/test/support/hackaday/file_7.html +714 -0
- data/test/support/hackaday/file_8.html +717 -0
- data/test/support/hackaday/file_9.html +719 -0
- data/test/support/test_encoding.html +12 -0
- data/test/support/test_encoding2.html +12 -0
- data/test/support/test_html.html +16 -0
- data/test/support/test_xml.xml +5 -0
- data/test/test_helper.rb +14 -0
- metadata +126 -3
data/lib/docparser/output/nil_output.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
module DocParser
|
2
|
+
# This Output is used for testing purposes.
|
3
|
+
|
4
|
+
# @see Output
|
5
|
+
class NilOutput < Output
|
6
|
+
# @!visibility private
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@rowcount = 0
|
10
|
+
end
|
11
|
+
|
12
|
+
def close
|
13
|
+
end
|
14
|
+
|
15
|
+
def write_row(row)
|
16
|
+
end
|
17
|
+
|
18
|
+
def add_row(row)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/docparser/output/screen_output.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'terminal-table'
|
2
2
|
require 'pageme'
|
3
3
|
module DocParser
|
4
|
-
# This Output
|
4
|
+
# This Output can be used for debugging purposes.
|
5
5
|
|
6
6
|
# It pipes all rows through a pager
|
7
7
|
# @see Output
|
@@ -25,6 +25,7 @@ module DocParser
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def write_row(row)
|
28
|
+
raise MissingHeaderException if @header.nil? || @header.length == 0
|
28
29
|
out = []
|
29
30
|
0.upto(@header.length - 1) do |counter|
|
30
31
|
out << [@header[counter], row[counter]]
|
data/lib/docparser/output/xlsx_output.rb
CHANGED
@@ -20,9 +20,19 @@ module DocParser
|
|
20
20
|
@sheet.add_row row
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
24
|
-
|
23
|
+
def footer
|
24
|
+
unless @header.nil?
|
25
|
+
@sheet.add_table "A1:#{@sheet.cells.last.r}", name: 'Data'
|
26
|
+
end
|
25
27
|
@package.serialize @filename
|
26
28
|
end
|
29
|
+
|
30
|
+
def rowcount
|
31
|
+
if @header.nil?
|
32
|
+
@sheet.rows.length
|
33
|
+
else
|
34
|
+
@sheet.rows.length - 1
|
35
|
+
end
|
36
|
+
end
|
27
37
|
end
|
28
38
|
end
|
data/lib/docparser/output/yaml_output.rb
CHANGED
@@ -6,9 +6,14 @@ module DocParser
|
|
6
6
|
class YAMLOutput < Output
|
7
7
|
# @!visibility private
|
8
8
|
def write_row(row)
|
9
|
+
raise MissingHeaderException if @header.nil? || @header.length == 0
|
9
10
|
@doc ||= {}
|
10
11
|
0.upto(@header.length - 1) do |counter|
|
11
|
-
|
12
|
+
if row.length > counter
|
13
|
+
@doc[@header[counter]] = row[counter]
|
14
|
+
else
|
15
|
+
@doc[@header[counter]] = ''
|
16
|
+
end
|
12
17
|
end
|
13
18
|
YAML.dump @doc, @file
|
14
19
|
end
|
data/lib/docparser/parser.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
$LOAD_PATH.unshift __dir__
|
2
2
|
require 'rubygems'
|
3
3
|
require 'bundler/setup'
|
4
4
|
require 'version'
|
@@ -8,6 +8,8 @@ require 'nokogiri'
|
|
8
8
|
require 'open-uri'
|
9
9
|
require 'parallel'
|
10
10
|
require 'set'
|
11
|
+
require 'log4r'
|
12
|
+
require 'log4r/formatter/patternformatter'
|
11
13
|
require 'output/screen_output.rb'
|
12
14
|
require 'output/csv_output.rb'
|
13
15
|
require 'output/html_output.rb'
|
@@ -15,14 +17,26 @@ require 'output/xlsx_output.rb'
|
|
15
17
|
require 'output/yaml_output.rb'
|
16
18
|
require 'output/json_output.rb'
|
17
19
|
require 'output/multi_output.rb'
|
18
|
-
|
20
|
+
require 'output/nil_output.rb'
|
21
|
+
|
22
|
+
Log4r.define_levels(*Log4r::Log4rConfig::LogLevels)
|
23
|
+
logger = Log4r::Logger.new('docparser')
|
24
|
+
output = Log4r::StdoutOutputter.new('docparser')
|
25
|
+
output.formatter = Log4r::PatternFormatter.new(pattern: '[%l %C] %d :: %m')
|
26
|
+
logger.outputters = output
|
27
|
+
logger.level = Log4r::INFO
|
28
|
+
logger = nil
|
29
|
+
output = nil
|
30
|
+
|
31
|
+
# The DocParser namespace
|
32
|
+
# See README.md for information on using DocParser
|
19
33
|
module DocParser
|
20
34
|
# The main parser class. This is the class you'll use to create your parser
|
21
35
|
# The real work happens in the Document class
|
22
36
|
# @see Document
|
23
37
|
class Parser
|
24
38
|
# @!visibility private
|
25
|
-
attr_reader :outputs
|
39
|
+
attr_reader :outputs, :files, :num_processes, :encoding
|
26
40
|
|
27
41
|
# Creates a new parser instance
|
28
42
|
# @param files [Array] An array containing URLs or paths to files
|
@@ -33,73 +47,90 @@ module DocParser
|
|
33
47
|
# @param range [Range] Range of files to process (nil means process all)
|
34
48
|
# @param num_processes [Fixnum] Number of parallel processes
|
35
49
|
def initialize(files: [], quiet: false, encoding: 'utf-8', parallel: true,
|
36
|
-
output:
|
50
|
+
output: nil, range: nil,
|
37
51
|
num_processes: Parallel.processor_count + 1)
|
38
|
-
@
|
39
|
-
@
|
40
|
-
@num_processes = num_processes
|
52
|
+
@num_processes = parallel ? num_processes : 1
|
53
|
+
@files = range ? files[range] : files
|
41
54
|
@encoding = encoding
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
55
|
+
|
56
|
+
Log4r::Logger['docparser'].level = quiet ? Log4r::ERROR : Log4r::INFO
|
57
|
+
|
58
|
+
unless output.nil?
|
59
|
+
if output.is_a? Output
|
60
|
+
@outputs = []
|
61
|
+
@outputs << output
|
62
|
+
elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
|
63
|
+
@outputs = output
|
64
|
+
else
|
65
|
+
raise ArgumentError, 'Invalid outputs specified'
|
66
|
+
end
|
67
|
+
|
68
|
+
@resultsets = Array.new(@outputs.length) { Set.new }
|
54
69
|
end
|
55
|
-
|
56
|
-
|
70
|
+
|
71
|
+
@logger = Log4r::Logger.new('docparser::parser')
|
72
|
+
@logger.info "DocParser v#{VERSION}"
|
73
|
+
@logger.info "#{@files.length} files loaded (encoding: #{@encoding})"
|
57
74
|
end
|
58
75
|
|
59
76
|
#
|
60
77
|
# Parses the `files`
|
61
78
|
#
|
62
79
|
def parse!(&block)
|
63
|
-
|
80
|
+
@logger.info "Parsing #{@files.length} files."
|
64
81
|
start_time = Time.now
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
log "Starting #{@num_processes} processes"
|
69
|
-
Parallel.map(@files, in_processes: @num_processes) do |file|
|
70
|
-
Document.new(file, encoding: @encoding, parser: self).parse!(&block)
|
71
|
-
end.each do |result|
|
72
|
-
result.each_with_index { |set, index| resultsets[index].merge(set) }
|
73
|
-
end
|
74
|
-
log 'Parallel processing finished, writing results..'
|
82
|
+
|
83
|
+
if @num_processes > 1
|
84
|
+
parallel_process(&block)
|
75
85
|
else
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
86
|
+
serial_process(&block)
|
87
|
+
end
|
88
|
+
|
89
|
+
@logger.info 'Processing finished'
|
90
|
+
|
91
|
+
write_to_outputs if @outputs
|
92
|
+
|
93
|
+
@logger.info sprintf('Done processing in %.2fs.', Time.now - start_time)
|
94
|
+
end
|
95
|
+
|
96
|
+
private
|
97
|
+
|
98
|
+
def parallel_process(&block)
|
99
|
+
@logger.info "Starting #{@num_processes} processes"
|
100
|
+
Parallel.map(@files, in_processes: @num_processes) do |file|
|
101
|
+
# :nocov: #
|
102
|
+
parse_doc(file, &block)
|
103
|
+
# :nocov: #
|
104
|
+
end.each do |result|
|
105
|
+
result.each_with_index do |set, index|
|
106
|
+
@resultsets[index].merge(set)
|
107
|
+
end if @outputs
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
def serial_process(&block)
|
112
|
+
@files.each do |file|
|
113
|
+
parse_doc(file, &block).each_with_index do |set, index|
|
114
|
+
@resultsets[index].merge(set) if @outputs
|
81
115
|
end
|
82
116
|
end
|
117
|
+
end
|
83
118
|
|
84
|
-
|
119
|
+
def parse_doc(file, &block)
|
120
|
+
doc = Document.new(filename: file, encoding: @encoding, parser: self)
|
121
|
+
doc.parse!(&block)
|
122
|
+
end
|
85
123
|
|
124
|
+
def write_to_outputs
|
125
|
+
@logger.info 'Writing data..'
|
86
126
|
@outputs.each_with_index do |output, index|
|
87
|
-
resultsets[index].each do |row|
|
127
|
+
@resultsets[index].each do |row|
|
88
128
|
output.add_row row
|
89
129
|
end
|
90
|
-
resultsets[index] = nil
|
130
|
+
@resultsets[index] = nil
|
91
131
|
output.close
|
92
|
-
log output.summary
|
93
132
|
end
|
94
|
-
|
95
|
-
log ''
|
96
|
-
log 'Done processing in %.2fs.' % (Time.now - start_time)
|
97
133
|
end
|
98
134
|
|
99
|
-
private
|
100
|
-
|
101
|
-
def log(str)
|
102
|
-
puts str unless @quiet
|
103
|
-
end
|
104
135
|
end
|
105
136
|
end
|
data/lib/docparser/version.rb
CHANGED
data/test/lib/docparser/blackbox_test.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require_relative '../../test_helper'
|
2
|
+
require 'open3'
|
3
|
+
require 'shellwords'
|
4
|
+
|
5
|
+
def cmd_to_sys(command)
|
6
|
+
Open3.popen3(command) do |stdin, stdout, stderr|
|
7
|
+
[stdout.read, stderr.read]
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
describe DocParser do
|
12
|
+
it 'should run the example without problems' do
|
13
|
+
curwd = Dir.getwd
|
14
|
+
Dir.mktmpdir do |dir|
|
15
|
+
Dir.chdir(dir)
|
16
|
+
example_file = Shellwords.escape(File.join($ROOT_DIR, 'example.rb'))
|
17
|
+
out, err = cmd_to_sys '/usr/bin/env ruby ' + example_file
|
18
|
+
err.must_be_empty
|
19
|
+
rows = out.scan(/(\d+) rows/).flatten
|
20
|
+
rows.length.must_equal 5
|
21
|
+
row_lengths = rows.group_by { |elem| elem.to_i }
|
22
|
+
row_lengths.length.must_equal 1
|
23
|
+
# HaD: 40 pages of 7 articles
|
24
|
+
row_lengths.keys.first.must_equal(7 * 40)
|
25
|
+
out.must_match(/Done processing/)
|
26
|
+
end
|
27
|
+
Dir.chdir(curwd)
|
28
|
+
end
|
29
|
+
end
|
data/test/lib/docparser/document_test.rb
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
require_relative '../../test_helper'
|
2
|
+
describe DocParser::Document do
|
3
|
+
before do
|
4
|
+
Log4r::Logger['docparser'].level = Log4r::INFO
|
5
|
+
$output = DocParser::NilOutput.new
|
6
|
+
@parser = Class.new do
|
7
|
+
define_method(:outputs) { [$output] }
|
8
|
+
end.new
|
9
|
+
@test_doc_path = File.join($SUPPORT_DIR, 'test_html.html')
|
10
|
+
@test_doc = DocParser::Document.new(filename: @test_doc_path,
|
11
|
+
parser: @parser)
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should read HTML contents' do
|
15
|
+
file = File.join($SUPPORT_DIR, 'test_html.html')
|
16
|
+
doc = DocParser::Document.new(filename: file, parser: @parser)
|
17
|
+
doc.doc.must_be_instance_of Nokogiri::HTML::Document
|
18
|
+
doc.html.must_equal(open(file).read)
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'should read XML contents' do
|
22
|
+
file = File.join($SUPPORT_DIR, 'test_xml.xml')
|
23
|
+
doc = DocParser::Document.new(filename: file, parser: @parser)
|
24
|
+
doc.doc.must_be_instance_of Nokogiri::XML::Document
|
25
|
+
doc.html.must_equal(open(file).read)
|
26
|
+
doc.xpath_content('xmltest > title').must_equal('Test XML')
|
27
|
+
doc.xpath_content('xmltest > test').must_equal('Character Data')
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should read remote contents' do
|
31
|
+
url = 'https://gist.github.com/jurriaan/3f2750aa546e3e6719cf/raw'
|
32
|
+
doc = DocParser::Document.new(filename: url, parser: @parser)
|
33
|
+
doc.html.must_equal(open(url).read)
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'should use the correct encoding' do
|
37
|
+
file = File.join($SUPPORT_DIR, 'test_encoding.html')
|
38
|
+
file2 = File.join($SUPPORT_DIR, 'test_encoding2.html')
|
39
|
+
doc = DocParser::Document.new(filename: file, parser: @parser)
|
40
|
+
doc2 = DocParser::Document.new(filename: file2,
|
41
|
+
parser: @parser,
|
42
|
+
encoding: 'iso-8859-1')
|
43
|
+
doc.html.must_equal(doc2.html)
|
44
|
+
doc.css_content('#encoding').must_equal(doc2.css_content('#encoding'))
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'should specify filename and encoding in #inspect' do
|
48
|
+
@test_doc.inspect.must_include(@test_doc.filename)
|
49
|
+
@test_doc.inspect.must_include(@test_doc.encoding)
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'should get the title of a document' do
|
53
|
+
@test_doc.title.must_equal('Test HTML')
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'should store the path to the document' do
|
57
|
+
@test_doc.filename.must_equal(@test_doc_path)
|
58
|
+
end
|
59
|
+
|
60
|
+
it 'should be possible to use css queries' do
|
61
|
+
css = 'article > h1 + p'
|
62
|
+
css_content = @test_doc.css_content(css)
|
63
|
+
css_element = @test_doc.css(css)
|
64
|
+
css_content.must_equal('Great article it is')
|
65
|
+
css_content.must_equal(css_element.first.content)
|
66
|
+
end
|
67
|
+
|
68
|
+
it 'should be possible to use xpath queries' do
|
69
|
+
xpath = '//li/ancestor::article/h1'
|
70
|
+
xpath_content = @test_doc.xpath_content(xpath)
|
71
|
+
xpath_element = @test_doc.xpath(xpath)
|
72
|
+
xpath_content.must_equal('This is an article')
|
73
|
+
xpath_content.must_equal(xpath_element.first.content)
|
74
|
+
end
|
75
|
+
|
76
|
+
it 'should be possible to use regular expressions' do
|
77
|
+
regex = @test_doc.regexp(/\<h1\>([^\<])*/)
|
78
|
+
regex.must_equal(@test_doc.html.match(/\<h1\>([^\<])*/))
|
79
|
+
end
|
80
|
+
|
81
|
+
it 'should be possible to use blocks on query methods' do
|
82
|
+
array = []
|
83
|
+
@test_doc.css('p') do |element|
|
84
|
+
array << element.content
|
85
|
+
end
|
86
|
+
array.last.must_equal('This is the last paragraph')
|
87
|
+
array2 = []
|
88
|
+
@test_doc.xpath('//p') do |element|
|
89
|
+
array2 << element.content
|
90
|
+
end
|
91
|
+
array2.must_equal(array)
|
92
|
+
end
|
93
|
+
|
94
|
+
it 'should warn when providing an empty file' do
|
95
|
+
file = Tempfile.new('empty')
|
96
|
+
file.write('')
|
97
|
+
file.close
|
98
|
+
|
99
|
+
open(file.path).read.empty?.must_equal true
|
100
|
+
_, err = capture_io do
|
101
|
+
# Switch to hijacked IO
|
102
|
+
Log4r::Outputter['docparser'].instance_variable_set(:@out, $stderr)
|
103
|
+
DocParser::Document.new(filename: file.path, parser: @parser)
|
104
|
+
end
|
105
|
+
# Restore IO
|
106
|
+
Log4r::Outputter['docparser'].instance_variable_set(:@out, $stderr)
|
107
|
+
err.must_include "#{file.path} is empty"
|
108
|
+
end
|
109
|
+
|
110
|
+
it 'should add the row to the results' do
|
111
|
+
@test_doc.add_row ['test']
|
112
|
+
@test_doc.add_row 'test', 'test2'
|
113
|
+
@test_doc.results.must_equal [[['test'], ['test', 'test2']]]
|
114
|
+
end
|
115
|
+
|
116
|
+
it 'should be possible to specify outputs directly' do
|
117
|
+
@test_doc.add_row ['test!'], output: $output
|
118
|
+
@test_doc.results.must_equal [[['test!']]]
|
119
|
+
end
|
120
|
+
|
121
|
+
it 'should be possible to use multiple outputs' do
|
122
|
+
output = DocParser::NilOutput.new
|
123
|
+
output2 = DocParser::NilOutput.new
|
124
|
+
parser = Class.new do
|
125
|
+
define_method(:outputs) { [output, output2] }
|
126
|
+
end.new
|
127
|
+
test_doc = DocParser::Document.new(filename: @test_doc_path,
|
128
|
+
parser: parser)
|
129
|
+
test_doc.add_row ['a'], output: 1
|
130
|
+
test_doc.add_row ['b'], output: 0
|
131
|
+
test_doc.results.must_equal [[['b']], [['a']]]
|
132
|
+
end
|
133
|
+
|
134
|
+
end
|
data/test/lib/docparser/logging_test.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require_relative '../../test_helper'
|
2
|
+
|
3
|
+
describe DocParser do
|
4
|
+
it 'should have a valid logger' do
|
5
|
+
logger = Log4r::Logger['docparser']
|
6
|
+
logger.wont_be_nil
|
7
|
+
logger.must_be_instance_of Log4r::Logger
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'must have the correct loglevel by default' do
|
11
|
+
Log4r::Logger['docparser'].level.must_equal Log4r::INFO
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should log to the correct output' do
|
15
|
+
outputters = Log4r::Logger['docparser'].outputters
|
16
|
+
outputters.length.must_equal 1
|
17
|
+
outputters.first.must_be_instance_of Log4r::StdoutOutputter
|
18
|
+
end
|
19
|
+
end
|