docparser 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rubocop.yml +5 -0
  4. data/.travis.yml +3 -0
  5. data/Gemfile +9 -1
  6. data/README.md +11 -4
  7. data/Rakefile +15 -0
  8. data/example.rb +9 -7
  9. data/lib/docparser.rb +1 -0
  10. data/lib/docparser/document.rb +18 -11
  11. data/lib/docparser/output.rb +8 -8
  12. data/lib/docparser/output/html_output.rb +53 -47
  13. data/lib/docparser/output/json_output.rb +8 -3
  14. data/lib/docparser/output/multi_output.rb +4 -8
  15. data/lib/docparser/output/nil_output.rb +21 -0
  16. data/lib/docparser/output/screen_output.rb +2 -1
  17. data/lib/docparser/output/xlsx_output.rb +12 -2
  18. data/lib/docparser/output/yaml_output.rb +6 -1
  19. data/lib/docparser/parser.rb +80 -49
  20. data/lib/docparser/version.rb +1 -1
  21. data/test/lib/docparser/blackbox_test.rb +29 -0
  22. data/test/lib/docparser/document_test.rb +134 -0
  23. data/test/lib/docparser/logging_test.rb +19 -0
  24. data/test/lib/docparser/output/csv_output_test.rb +51 -0
  25. data/test/lib/docparser/output/html_output_test.rb +57 -0
  26. data/test/lib/docparser/output/json_output_test.rb +65 -0
  27. data/test/lib/docparser/output/multi_output_test.rb +80 -0
  28. data/test/lib/docparser/output/nil_output_test.rb +27 -0
  29. data/test/lib/docparser/output/screen_output_test.rb +55 -0
  30. data/test/lib/docparser/output/xlsx_output_test.rb +53 -0
  31. data/test/lib/docparser/output/yaml_output_test.rb +76 -0
  32. data/test/lib/docparser/output_test.rb +85 -0
  33. data/test/lib/docparser/parser_test.rb +197 -0
  34. data/test/lib/docparser/version_test.rb +11 -0
  35. data/test/support/hackaday/dl.rb +4 -0
  36. data/test/support/hackaday/file_1.html +716 -0
  37. data/test/support/hackaday/file_10.html +791 -0
  38. data/test/support/hackaday/file_11.html +787 -0
  39. data/test/support/hackaday/file_12.html +715 -0
  40. data/test/support/hackaday/file_13.html +793 -0
  41. data/test/support/hackaday/file_14.html +718 -0
  42. data/test/support/hackaday/file_15.html +707 -0
  43. data/test/support/hackaday/file_16.html +713 -0
  44. data/test/support/hackaday/file_17.html +715 -0
  45. data/test/support/hackaday/file_18.html +725 -0
  46. data/test/support/hackaday/file_19.html +715 -0
  47. data/test/support/hackaday/file_2.html +793 -0
  48. data/test/support/hackaday/file_20.html +795 -0
  49. data/test/support/hackaday/file_21.html +804 -0
  50. data/test/support/hackaday/file_22.html +722 -0
  51. data/test/support/hackaday/file_23.html +793 -0
  52. data/test/support/hackaday/file_24.html +717 -0
  53. data/test/support/hackaday/file_25.html +715 -0
  54. data/test/support/hackaday/file_26.html +717 -0
  55. data/test/support/hackaday/file_27.html +723 -0
  56. data/test/support/hackaday/file_28.html +711 -0
  57. data/test/support/hackaday/file_29.html +711 -0
  58. data/test/support/hackaday/file_3.html +794 -0
  59. data/test/support/hackaday/file_30.html +715 -0
  60. data/test/support/hackaday/file_31.html +713 -0
  61. data/test/support/hackaday/file_32.html +714 -0
  62. data/test/support/hackaday/file_33.html +716 -0
  63. data/test/support/hackaday/file_34.html +714 -0
  64. data/test/support/hackaday/file_35.html +792 -0
  65. data/test/support/hackaday/file_36.html +719 -0
  66. data/test/support/hackaday/file_37.html +712 -0
  67. data/test/support/hackaday/file_38.html +709 -0
  68. data/test/support/hackaday/file_39.html +808 -0
  69. data/test/support/hackaday/file_4.html +814 -0
  70. data/test/support/hackaday/file_40.html +801 -0
  71. data/test/support/hackaday/file_5.html +715 -0
  72. data/test/support/hackaday/file_6.html +792 -0
  73. data/test/support/hackaday/file_7.html +714 -0
  74. data/test/support/hackaday/file_8.html +717 -0
  75. data/test/support/hackaday/file_9.html +719 -0
  76. data/test/support/test_encoding.html +12 -0
  77. data/test/support/test_encoding2.html +12 -0
  78. data/test/support/test_html.html +16 -0
  79. data/test/support/test_xml.xml +5 -0
  80. data/test/test_helper.rb +14 -0
  81. metadata +126 -3
@@ -0,0 +1,21 @@
1
+ module DocParser
2
+ # This Output is used for testing purposes.
3
+
4
+ # @see Output
5
+ class NilOutput < Output
6
+ # @!visibility private
7
+
8
+ def initialize
9
+ @rowcount = 0
10
+ end
11
+
12
+ def close
13
+ end
14
+
15
+ def write_row(row)
16
+ end
17
+
18
+ def add_row(row)
19
+ end
20
+ end
21
+ end
@@ -1,7 +1,7 @@
1
1
  require 'terminal-table'
2
2
  require 'pageme'
3
3
  module DocParser
4
- # This Output is can be used for debugging purposes.
4
+ # This Output can be used for debugging purposes.
5
5
 
6
6
  # It pipes all rows through a pager
7
7
  # @see Output
@@ -25,6 +25,7 @@ module DocParser
25
25
  end
26
26
 
27
27
  def write_row(row)
28
+ raise MissingHeaderException if @header.nil? || @header.length == 0
28
29
  out = []
29
30
  0.upto(@header.length - 1) do |counter|
30
31
  out << [@header[counter], row[counter]]
@@ -20,9 +20,19 @@ module DocParser
20
20
  @sheet.add_row row
21
21
  end
22
22
 
23
- def close
24
- @sheet.add_table "A1:#{@sheet.cells.last.r}", name: 'Data'
23
+ def footer
24
+ unless @header.nil?
25
+ @sheet.add_table "A1:#{@sheet.cells.last.r}", name: 'Data'
26
+ end
25
27
  @package.serialize @filename
26
28
  end
29
+
30
+ def rowcount
31
+ if @header.nil?
32
+ @sheet.rows.length
33
+ else
34
+ @sheet.rows.length - 1
35
+ end
36
+ end
27
37
  end
28
38
  end
@@ -6,9 +6,14 @@ module DocParser
6
6
  class YAMLOutput < Output
7
7
  # @!visibility private
8
8
  def write_row(row)
9
+ raise MissingHeaderException if @header.nil? || @header.length == 0
9
10
  @doc ||= {}
10
11
  0.upto(@header.length - 1) do |counter|
11
- @doc[@header[counter]] = row[counter] rescue ''
12
+ if row.length > counter
13
+ @doc[@header[counter]] = row[counter]
14
+ else
15
+ @doc[@header[counter]] = ''
16
+ end
12
17
  end
13
18
  YAML.dump @doc, @file
14
19
  end
@@ -1,4 +1,4 @@
1
- $:.unshift __dir__
1
+ $LOAD_PATH.unshift __dir__
2
2
  require 'rubygems'
3
3
  require 'bundler/setup'
4
4
  require 'version'
@@ -8,6 +8,8 @@ require 'nokogiri'
8
8
  require 'open-uri'
9
9
  require 'parallel'
10
10
  require 'set'
11
+ require 'log4r'
12
+ require 'log4r/formatter/patternformatter'
11
13
  require 'output/screen_output.rb'
12
14
  require 'output/csv_output.rb'
13
15
  require 'output/html_output.rb'
@@ -15,14 +17,26 @@ require 'output/xlsx_output.rb'
15
17
  require 'output/yaml_output.rb'
16
18
  require 'output/json_output.rb'
17
19
  require 'output/multi_output.rb'
18
- # {include:file:README.md}
20
+ require 'output/nil_output.rb'
21
+
22
+ Log4r.define_levels(*Log4r::Log4rConfig::LogLevels)
23
+ logger = Log4r::Logger.new('docparser')
24
+ output = Log4r::StdoutOutputter.new('docparser')
25
+ output.formatter = Log4r::PatternFormatter.new(pattern: '[%l %C] %d :: %m')
26
+ logger.outputters = output
27
+ logger.level = Log4r::INFO
28
+ logger = nil
29
+ output = nil
30
+
31
+ # The DocParser namespace
32
+ # See README.md for information on using DocParser
19
33
  module DocParser
20
34
  # The main parser class. This is the class you'll use to create your parser
21
35
  # The real work happens in the Document class
22
36
  # @see Document
23
37
  class Parser
24
38
  # @!visibility private
25
- attr_reader :outputs
39
+ attr_reader :outputs, :files, :num_processes, :encoding
26
40
 
27
41
  # Creates a new parser instance
28
42
  # @param files [Array] An array containing URLs or paths to files
@@ -33,73 +47,90 @@ module DocParser
33
47
  # @param range [Range] Range of files to process (nil means process all)
34
48
  # @param num_processes [Fixnum] Number of parallel processes
35
49
  def initialize(files: [], quiet: false, encoding: 'utf-8', parallel: true,
36
- output: ScreenOutput.new, range: nil,
50
+ output: nil, range: nil,
37
51
  num_processes: Parallel.processor_count + 1)
38
- @quiet = quiet
39
- @parallel = parallel
40
- @num_processes = num_processes
52
+ @num_processes = parallel ? num_processes : 1
53
+ @files = range ? files[range] : files
41
54
  @encoding = encoding
42
- if output.is_a? Output
43
- @outputs = []
44
- @outputs << output
45
- elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
46
- @outputs = output
47
- else
48
- raise ArgumentError, 'No outputs specified'
49
- end
50
- @files = if range
51
- files[range]
52
- else
53
- files
55
+
56
+ Log4r::Logger['docparser'].level = quiet ? Log4r::ERROR : Log4r::INFO
57
+
58
+ unless output.nil?
59
+ if output.is_a? Output
60
+ @outputs = []
61
+ @outputs << output
62
+ elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
63
+ @outputs = output
64
+ else
65
+ raise ArgumentError, 'Invalid outputs specified'
66
+ end
67
+
68
+ @resultsets = Array.new(@outputs.length) { Set.new }
54
69
  end
55
- log 'DocParser loaded..'
56
- log "#{@files.length} files loaded (encoding: #{@encoding})"
70
+
71
+ @logger = Log4r::Logger.new('docparser::parser')
72
+ @logger.info "DocParser v#{VERSION}"
73
+ @logger.info "#{@files.length} files loaded (encoding: #{@encoding})"
57
74
  end
58
75
 
59
76
  #
60
77
  # Parses the `files`
61
78
  #
62
79
  def parse!(&block)
63
- log "Parsing #{@files.length} files."
80
+ @logger.info "Parsing #{@files.length} files."
64
81
  start_time = Time.now
65
- resultsets = Array.new(@outputs.length) { Set.new }
66
-
67
- if @parallel && @num_processes > 1
68
- log "Starting #{@num_processes} processes"
69
- Parallel.map(@files, in_processes: @num_processes) do |file|
70
- Document.new(file, encoding: @encoding, parser: self).parse!(&block)
71
- end.each do |result|
72
- result.each_with_index { |set, index| resultsets[index].merge(set) }
73
- end
74
- log 'Parallel processing finished, writing results..'
82
+
83
+ if @num_processes > 1
84
+ parallel_process(&block)
75
85
  else
76
- @files.each do |file|
77
- doc = Document.new(file, encoding: @encoding, parser: self)
78
- doc.parse!(&block).each_with_index do |set, index|
79
- resultsets[index].merge(set)
80
- end
86
+ serial_process(&block)
87
+ end
88
+
89
+ @logger.info 'Processing finished'
90
+
91
+ write_to_outputs if @outputs
92
+
93
+ @logger.info sprintf('Done processing in %.2fs.', Time.now - start_time)
94
+ end
95
+
96
+ private
97
+
98
+ def parallel_process(&block)
99
+ @logger.info "Starting #{@num_processes} processes"
100
+ Parallel.map(@files, in_processes: @num_processes) do |file|
101
+ # :nocov: #
102
+ parse_doc(file, &block)
103
+ # :nocov: #
104
+ end.each do |result|
105
+ result.each_with_index do |set, index|
106
+ @resultsets[index].merge(set)
107
+ end if @outputs
108
+ end
109
+ end
110
+
111
+ def serial_process(&block)
112
+ @files.each do |file|
113
+ parse_doc(file, &block).each_with_index do |set, index|
114
+ @resultsets[index].merge(set) if @outputs
81
115
  end
82
116
  end
117
+ end
83
118
 
84
- log "\nSummary\n======="
119
+ def parse_doc(file, &block)
120
+ doc = Document.new(filename: file, encoding: @encoding, parser: self)
121
+ doc.parse!(&block)
122
+ end
85
123
 
124
+ def write_to_outputs
125
+ @logger.info 'Writing data..'
86
126
  @outputs.each_with_index do |output, index|
87
- resultsets[index].each do |row|
127
+ @resultsets[index].each do |row|
88
128
  output.add_row row
89
129
  end
90
- resultsets[index] = nil
130
+ @resultsets[index] = nil
91
131
  output.close
92
- log output.summary
93
132
  end
94
-
95
- log ''
96
- log 'Done processing in %.2fs.' % (Time.now - start_time)
97
133
  end
98
134
 
99
- private
100
-
101
- def log(str)
102
- puts str unless @quiet
103
- end
104
135
  end
105
136
  end
@@ -1,4 +1,4 @@
1
1
  module DocParser
2
2
  # The current version of DocParser
3
- VERSION = '0.0.1'
3
+ VERSION = '0.1.0'
4
4
  end
@@ -0,0 +1,29 @@
1
+ require_relative '../../test_helper'
2
+ require 'open3'
3
+ require 'shellwords'
4
+
5
+ def cmd_to_sys(command)
6
+ Open3.popen3(command) do |stdin, stdout, stderr|
7
+ [stdout.read, stderr.read]
8
+ end
9
+ end
10
+
11
+ describe DocParser do
12
+ it 'should run the example without problems' do
13
+ curwd = Dir.getwd
14
+ Dir.mktmpdir do |dir|
15
+ Dir.chdir(dir)
16
+ example_file = Shellwords.escape(File.join($ROOT_DIR, 'example.rb'))
17
+ out, err = cmd_to_sys '/usr/bin/env ruby ' + example_file
18
+ err.must_be_empty
19
+ rows = out.scan(/(\d+) rows/).flatten
20
+ rows.length.must_equal 5
21
+ row_lengths = rows.group_by { |elem| elem.to_i }
22
+ row_lengths.length.must_equal 1
23
+ # HaD: 40 pages of 7 articles
24
+ row_lengths.keys.first.must_equal(7 * 40)
25
+ out.must_match(/Done processing/)
26
+ end
27
+ Dir.chdir(curwd)
28
+ end
29
+ end
@@ -0,0 +1,134 @@
1
+ require_relative '../../test_helper'
2
+ describe DocParser::Document do
3
+ before do
4
+ Log4r::Logger['docparser'].level = Log4r::INFO
5
+ $output = DocParser::NilOutput.new
6
+ @parser = Class.new do
7
+ define_method(:outputs) { [$output] }
8
+ end.new
9
+ @test_doc_path = File.join($SUPPORT_DIR, 'test_html.html')
10
+ @test_doc = DocParser::Document.new(filename: @test_doc_path,
11
+ parser: @parser)
12
+ end
13
+
14
+ it 'should read HTML contents' do
15
+ file = File.join($SUPPORT_DIR, 'test_html.html')
16
+ doc = DocParser::Document.new(filename: file, parser: @parser)
17
+ doc.doc.must_be_instance_of Nokogiri::HTML::Document
18
+ doc.html.must_equal(open(file).read)
19
+ end
20
+
21
+ it 'should read XML contents' do
22
+ file = File.join($SUPPORT_DIR, 'test_xml.xml')
23
+ doc = DocParser::Document.new(filename: file, parser: @parser)
24
+ doc.doc.must_be_instance_of Nokogiri::XML::Document
25
+ doc.html.must_equal(open(file).read)
26
+ doc.xpath_content('xmltest > title').must_equal('Test XML')
27
+ doc.xpath_content('xmltest > test').must_equal('Character Data')
28
+ end
29
+
30
+ it 'should read remote contents' do
31
+ url = 'https://gist.github.com/jurriaan/3f2750aa546e3e6719cf/raw'
32
+ doc = DocParser::Document.new(filename: url, parser: @parser)
33
+ doc.html.must_equal(open(url).read)
34
+ end
35
+
36
+ it 'should use the correct encoding' do
37
+ file = File.join($SUPPORT_DIR, 'test_encoding.html')
38
+ file2 = File.join($SUPPORT_DIR, 'test_encoding2.html')
39
+ doc = DocParser::Document.new(filename: file, parser: @parser)
40
+ doc2 = DocParser::Document.new(filename: file2,
41
+ parser: @parser,
42
+ encoding: 'iso-8859-1')
43
+ doc.html.must_equal(doc2.html)
44
+ doc.css_content('#encoding').must_equal(doc2.css_content('#encoding'))
45
+ end
46
+
47
+ it 'should specify filename and encoding in #inspect' do
48
+ @test_doc.inspect.must_include(@test_doc.filename)
49
+ @test_doc.inspect.must_include(@test_doc.encoding)
50
+ end
51
+
52
+ it 'should get the title of a document' do
53
+ @test_doc.title.must_equal('Test HTML')
54
+ end
55
+
56
+ it 'should store the path to the document' do
57
+ @test_doc.filename.must_equal(@test_doc_path)
58
+ end
59
+
60
+ it 'should be possible to use css queries' do
61
+ css = 'article > h1 + p'
62
+ css_content = @test_doc.css_content(css)
63
+ css_element = @test_doc.css(css)
64
+ css_content.must_equal('Great article it is')
65
+ css_content.must_equal(css_element.first.content)
66
+ end
67
+
68
+ it 'should be possible to use xpath queries' do
69
+ xpath = '//li/ancestor::article/h1'
70
+ xpath_content = @test_doc.xpath_content(xpath)
71
+ xpath_element = @test_doc.xpath(xpath)
72
+ xpath_content.must_equal('This is an article')
73
+ xpath_content.must_equal(xpath_element.first.content)
74
+ end
75
+
76
+ it 'should be possible to use regular expressions' do
77
+ regex = @test_doc.regexp(/\<h1\>([^\<])*/)
78
+ regex.must_equal(@test_doc.html.match(/\<h1\>([^\<])*/))
79
+ end
80
+
81
+ it 'should be possible to use blocks on query methods' do
82
+ array = []
83
+ @test_doc.css('p') do |element|
84
+ array << element.content
85
+ end
86
+ array.last.must_equal('This is the last paragraph')
87
+ array2 = []
88
+ @test_doc.xpath('//p') do |element|
89
+ array2 << element.content
90
+ end
91
+ array2.must_equal(array)
92
+ end
93
+
94
+ it 'should warn when providing an empty file' do
95
+ file = Tempfile.new('empty')
96
+ file.write('')
97
+ file.close
98
+
99
+ open(file.path).read.empty?.must_equal true
100
+ _, err = capture_io do
101
+ # Switch to hijacked IO
102
+ Log4r::Outputter['docparser'].instance_variable_set(:@out, $stderr)
103
+ DocParser::Document.new(filename: file.path, parser: @parser)
104
+ end
105
+ # Restore IO
106
+ Log4r::Outputter['docparser'].instance_variable_set(:@out, $stderr)
107
+ err.must_include "#{file.path} is empty"
108
+ end
109
+
110
+ it 'should add the row to the results' do
111
+ @test_doc.add_row ['test']
112
+ @test_doc.add_row 'test', 'test2'
113
+ @test_doc.results.must_equal [[['test'], ['test', 'test2']]]
114
+ end
115
+
116
+ it 'should be possible to specify outputs directly' do
117
+ @test_doc.add_row ['test!'], output: $output
118
+ @test_doc.results.must_equal [[['test!']]]
119
+ end
120
+
121
+ it 'should be possible to use multiple outputs' do
122
+ output = DocParser::NilOutput.new
123
+ output2 = DocParser::NilOutput.new
124
+ parser = Class.new do
125
+ define_method(:outputs) { [output, output2] }
126
+ end.new
127
+ test_doc = DocParser::Document.new(filename: @test_doc_path,
128
+ parser: parser)
129
+ test_doc.add_row ['a'], output: 1
130
+ test_doc.add_row ['b'], output: 0
131
+ test_doc.results.must_equal [[['b']], [['a']]]
132
+ end
133
+
134
+ end
@@ -0,0 +1,19 @@
1
+ require_relative '../../test_helper'
2
+
3
+ describe DocParser do
4
+ it 'should have a valid logger' do
5
+ logger = Log4r::Logger['docparser']
6
+ logger.wont_be_nil
7
+ logger.must_be_instance_of Log4r::Logger
8
+ end
9
+
10
+ it 'must have the correct loglevel by default' do
11
+ Log4r::Logger['docparser'].level.must_equal Log4r::INFO
12
+ end
13
+
14
+ it 'should log to the correct output' do
15
+ outputters = Log4r::Logger['docparser'].outputters
16
+ outputters.length.must_equal 1
17
+ outputters.first.must_be_instance_of Log4r::StdoutOutputter
18
+ end
19
+ end