docparser 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81):
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rubocop.yml +5 -0
  4. data/.travis.yml +3 -0
  5. data/Gemfile +9 -1
  6. data/README.md +11 -4
  7. data/Rakefile +15 -0
  8. data/example.rb +9 -7
  9. data/lib/docparser.rb +1 -0
  10. data/lib/docparser/document.rb +18 -11
  11. data/lib/docparser/output.rb +8 -8
  12. data/lib/docparser/output/html_output.rb +53 -47
  13. data/lib/docparser/output/json_output.rb +8 -3
  14. data/lib/docparser/output/multi_output.rb +4 -8
  15. data/lib/docparser/output/nil_output.rb +21 -0
  16. data/lib/docparser/output/screen_output.rb +2 -1
  17. data/lib/docparser/output/xlsx_output.rb +12 -2
  18. data/lib/docparser/output/yaml_output.rb +6 -1
  19. data/lib/docparser/parser.rb +80 -49
  20. data/lib/docparser/version.rb +1 -1
  21. data/test/lib/docparser/blackbox_test.rb +29 -0
  22. data/test/lib/docparser/document_test.rb +134 -0
  23. data/test/lib/docparser/logging_test.rb +19 -0
  24. data/test/lib/docparser/output/csv_output_test.rb +51 -0
  25. data/test/lib/docparser/output/html_output_test.rb +57 -0
  26. data/test/lib/docparser/output/json_output_test.rb +65 -0
  27. data/test/lib/docparser/output/multi_output_test.rb +80 -0
  28. data/test/lib/docparser/output/nil_output_test.rb +27 -0
  29. data/test/lib/docparser/output/screen_output_test.rb +55 -0
  30. data/test/lib/docparser/output/xlsx_output_test.rb +53 -0
  31. data/test/lib/docparser/output/yaml_output_test.rb +76 -0
  32. data/test/lib/docparser/output_test.rb +85 -0
  33. data/test/lib/docparser/parser_test.rb +197 -0
  34. data/test/lib/docparser/version_test.rb +11 -0
  35. data/test/support/hackaday/dl.rb +4 -0
  36. data/test/support/hackaday/file_1.html +716 -0
  37. data/test/support/hackaday/file_10.html +791 -0
  38. data/test/support/hackaday/file_11.html +787 -0
  39. data/test/support/hackaday/file_12.html +715 -0
  40. data/test/support/hackaday/file_13.html +793 -0
  41. data/test/support/hackaday/file_14.html +718 -0
  42. data/test/support/hackaday/file_15.html +707 -0
  43. data/test/support/hackaday/file_16.html +713 -0
  44. data/test/support/hackaday/file_17.html +715 -0
  45. data/test/support/hackaday/file_18.html +725 -0
  46. data/test/support/hackaday/file_19.html +715 -0
  47. data/test/support/hackaday/file_2.html +793 -0
  48. data/test/support/hackaday/file_20.html +795 -0
  49. data/test/support/hackaday/file_21.html +804 -0
  50. data/test/support/hackaday/file_22.html +722 -0
  51. data/test/support/hackaday/file_23.html +793 -0
  52. data/test/support/hackaday/file_24.html +717 -0
  53. data/test/support/hackaday/file_25.html +715 -0
  54. data/test/support/hackaday/file_26.html +717 -0
  55. data/test/support/hackaday/file_27.html +723 -0
  56. data/test/support/hackaday/file_28.html +711 -0
  57. data/test/support/hackaday/file_29.html +711 -0
  58. data/test/support/hackaday/file_3.html +794 -0
  59. data/test/support/hackaday/file_30.html +715 -0
  60. data/test/support/hackaday/file_31.html +713 -0
  61. data/test/support/hackaday/file_32.html +714 -0
  62. data/test/support/hackaday/file_33.html +716 -0
  63. data/test/support/hackaday/file_34.html +714 -0
  64. data/test/support/hackaday/file_35.html +792 -0
  65. data/test/support/hackaday/file_36.html +719 -0
  66. data/test/support/hackaday/file_37.html +712 -0
  67. data/test/support/hackaday/file_38.html +709 -0
  68. data/test/support/hackaday/file_39.html +808 -0
  69. data/test/support/hackaday/file_4.html +814 -0
  70. data/test/support/hackaday/file_40.html +801 -0
  71. data/test/support/hackaday/file_5.html +715 -0
  72. data/test/support/hackaday/file_6.html +792 -0
  73. data/test/support/hackaday/file_7.html +714 -0
  74. data/test/support/hackaday/file_8.html +717 -0
  75. data/test/support/hackaday/file_9.html +719 -0
  76. data/test/support/test_encoding.html +12 -0
  77. data/test/support/test_encoding2.html +12 -0
  78. data/test/support/test_html.html +16 -0
  79. data/test/support/test_xml.xml +5 -0
  80. data/test/test_helper.rb +14 -0
  81. metadata +126 -3
@@ -0,0 +1,21 @@
1
+ module DocParser
2
+ # This Output is used for testing purposes.
3
+
4
+ # @see Output
5
+ class NilOutput < Output
6
+ # @!visibility private
7
+
8
+ def initialize
9
+ @rowcount = 0
10
+ end
11
+
12
+ def close
13
+ end
14
+
15
+ def write_row(row)
16
+ end
17
+
18
+ def add_row(row)
19
+ end
20
+ end
21
+ end
@@ -1,7 +1,7 @@
1
1
  require 'terminal-table'
2
2
  require 'pageme'
3
3
  module DocParser
4
- # This Output is can be used for debugging purposes.
4
+ # This Output can be used for debugging purposes.
5
5
 
6
6
  # It pipes all rows through a pager
7
7
  # @see Output
@@ -25,6 +25,7 @@ module DocParser
25
25
  end
26
26
 
27
27
  def write_row(row)
28
+ raise MissingHeaderException if @header.nil? || @header.length == 0
28
29
  out = []
29
30
  0.upto(@header.length - 1) do |counter|
30
31
  out << [@header[counter], row[counter]]
@@ -20,9 +20,19 @@ module DocParser
20
20
  @sheet.add_row row
21
21
  end
22
22
 
23
- def close
24
- @sheet.add_table "A1:#{@sheet.cells.last.r}", name: 'Data'
23
+ def footer
24
+ unless @header.nil?
25
+ @sheet.add_table "A1:#{@sheet.cells.last.r}", name: 'Data'
26
+ end
25
27
  @package.serialize @filename
26
28
  end
29
+
30
+ def rowcount
31
+ if @header.nil?
32
+ @sheet.rows.length
33
+ else
34
+ @sheet.rows.length - 1
35
+ end
36
+ end
27
37
  end
28
38
  end
@@ -6,9 +6,14 @@ module DocParser
6
6
  class YAMLOutput < Output
7
7
  # @!visibility private
8
8
  def write_row(row)
9
+ raise MissingHeaderException if @header.nil? || @header.length == 0
9
10
  @doc ||= {}
10
11
  0.upto(@header.length - 1) do |counter|
11
- @doc[@header[counter]] = row[counter] rescue ''
12
+ if row.length > counter
13
+ @doc[@header[counter]] = row[counter]
14
+ else
15
+ @doc[@header[counter]] = ''
16
+ end
12
17
  end
13
18
  YAML.dump @doc, @file
14
19
  end
@@ -1,4 +1,4 @@
1
- $:.unshift __dir__
1
+ $LOAD_PATH.unshift __dir__
2
2
  require 'rubygems'
3
3
  require 'bundler/setup'
4
4
  require 'version'
@@ -8,6 +8,8 @@ require 'nokogiri'
8
8
  require 'open-uri'
9
9
  require 'parallel'
10
10
  require 'set'
11
+ require 'log4r'
12
+ require 'log4r/formatter/patternformatter'
11
13
  require 'output/screen_output.rb'
12
14
  require 'output/csv_output.rb'
13
15
  require 'output/html_output.rb'
@@ -15,14 +17,26 @@ require 'output/xlsx_output.rb'
15
17
  require 'output/yaml_output.rb'
16
18
  require 'output/json_output.rb'
17
19
  require 'output/multi_output.rb'
18
- # {include:file:README.md}
20
+ require 'output/nil_output.rb'
21
+
22
+ Log4r.define_levels(*Log4r::Log4rConfig::LogLevels)
23
+ logger = Log4r::Logger.new('docparser')
24
+ output = Log4r::StdoutOutputter.new('docparser')
25
+ output.formatter = Log4r::PatternFormatter.new(pattern: '[%l %C] %d :: %m')
26
+ logger.outputters = output
27
+ logger.level = Log4r::INFO
28
+ logger = nil
29
+ output = nil
30
+
31
+ # The DocParser namespace
32
+ # See README.md for information on using DocParser
19
33
  module DocParser
20
34
  # The main parser class. This is the class you'll use to create your parser
21
35
  # The real work happens in the Document class
22
36
  # @see Document
23
37
  class Parser
24
38
  # @!visibility private
25
- attr_reader :outputs
39
+ attr_reader :outputs, :files, :num_processes, :encoding
26
40
 
27
41
  # Creates a new parser instance
28
42
  # @param files [Array] An array containing URLs or paths to files
@@ -33,73 +47,90 @@ module DocParser
33
47
  # @param range [Range] Range of files to process (nil means process all)
34
48
  # @param num_processes [Fixnum] Number of parallel processes
35
49
  def initialize(files: [], quiet: false, encoding: 'utf-8', parallel: true,
36
- output: ScreenOutput.new, range: nil,
50
+ output: nil, range: nil,
37
51
  num_processes: Parallel.processor_count + 1)
38
- @quiet = quiet
39
- @parallel = parallel
40
- @num_processes = num_processes
52
+ @num_processes = parallel ? num_processes : 1
53
+ @files = range ? files[range] : files
41
54
  @encoding = encoding
42
- if output.is_a? Output
43
- @outputs = []
44
- @outputs << output
45
- elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
46
- @outputs = output
47
- else
48
- raise ArgumentError, 'No outputs specified'
49
- end
50
- @files = if range
51
- files[range]
52
- else
53
- files
55
+
56
+ Log4r::Logger['docparser'].level = quiet ? Log4r::ERROR : Log4r::INFO
57
+
58
+ unless output.nil?
59
+ if output.is_a? Output
60
+ @outputs = []
61
+ @outputs << output
62
+ elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
63
+ @outputs = output
64
+ else
65
+ raise ArgumentError, 'Invalid outputs specified'
66
+ end
67
+
68
+ @resultsets = Array.new(@outputs.length) { Set.new }
54
69
  end
55
- log 'DocParser loaded..'
56
- log "#{@files.length} files loaded (encoding: #{@encoding})"
70
+
71
+ @logger = Log4r::Logger.new('docparser::parser')
72
+ @logger.info "DocParser v#{VERSION}"
73
+ @logger.info "#{@files.length} files loaded (encoding: #{@encoding})"
57
74
  end
58
75
 
59
76
  #
60
77
  # Parses the `files`
61
78
  #
62
79
  def parse!(&block)
63
- log "Parsing #{@files.length} files."
80
+ @logger.info "Parsing #{@files.length} files."
64
81
  start_time = Time.now
65
- resultsets = Array.new(@outputs.length) { Set.new }
66
-
67
- if @parallel && @num_processes > 1
68
- log "Starting #{@num_processes} processes"
69
- Parallel.map(@files, in_processes: @num_processes) do |file|
70
- Document.new(file, encoding: @encoding, parser: self).parse!(&block)
71
- end.each do |result|
72
- result.each_with_index { |set, index| resultsets[index].merge(set) }
73
- end
74
- log 'Parallel processing finished, writing results..'
82
+
83
+ if @num_processes > 1
84
+ parallel_process(&block)
75
85
  else
76
- @files.each do |file|
77
- doc = Document.new(file, encoding: @encoding, parser: self)
78
- doc.parse!(&block).each_with_index do |set, index|
79
- resultsets[index].merge(set)
80
- end
86
+ serial_process(&block)
87
+ end
88
+
89
+ @logger.info 'Processing finished'
90
+
91
+ write_to_outputs if @outputs
92
+
93
+ @logger.info sprintf('Done processing in %.2fs.', Time.now - start_time)
94
+ end
95
+
96
+ private
97
+
98
+ def parallel_process(&block)
99
+ @logger.info "Starting #{@num_processes} processes"
100
+ Parallel.map(@files, in_processes: @num_processes) do |file|
101
+ # :nocov: #
102
+ parse_doc(file, &block)
103
+ # :nocov: #
104
+ end.each do |result|
105
+ result.each_with_index do |set, index|
106
+ @resultsets[index].merge(set)
107
+ end if @outputs
108
+ end
109
+ end
110
+
111
+ def serial_process(&block)
112
+ @files.each do |file|
113
+ parse_doc(file, &block).each_with_index do |set, index|
114
+ @resultsets[index].merge(set) if @outputs
81
115
  end
82
116
  end
117
+ end
83
118
 
84
- log "\nSummary\n======="
119
+ def parse_doc(file, &block)
120
+ doc = Document.new(filename: file, encoding: @encoding, parser: self)
121
+ doc.parse!(&block)
122
+ end
85
123
 
124
+ def write_to_outputs
125
+ @logger.info 'Writing data..'
86
126
  @outputs.each_with_index do |output, index|
87
- resultsets[index].each do |row|
127
+ @resultsets[index].each do |row|
88
128
  output.add_row row
89
129
  end
90
- resultsets[index] = nil
130
+ @resultsets[index] = nil
91
131
  output.close
92
- log output.summary
93
132
  end
94
-
95
- log ''
96
- log 'Done processing in %.2fs.' % (Time.now - start_time)
97
133
  end
98
134
 
99
- private
100
-
101
- def log(str)
102
- puts str unless @quiet
103
- end
104
135
  end
105
136
  end
@@ -1,4 +1,4 @@
1
1
  module DocParser
2
2
  # The current version of DocParser
3
- VERSION = '0.0.1'
3
+ VERSION = '0.1.0'
4
4
  end
@@ -0,0 +1,29 @@
1
+ require_relative '../../test_helper'
2
+ require 'open3'
3
+ require 'shellwords'
4
+
5
+ def cmd_to_sys(command)
6
+ Open3.popen3(command) do |stdin, stdout, stderr|
7
+ [stdout.read, stderr.read]
8
+ end
9
+ end
10
+
11
+ describe DocParser do
12
+ it 'should run the example without problems' do
13
+ curwd = Dir.getwd
14
+ Dir.mktmpdir do |dir|
15
+ Dir.chdir(dir)
16
+ example_file = Shellwords.escape(File.join($ROOT_DIR, 'example.rb'))
17
+ out, err = cmd_to_sys '/usr/bin/env ruby ' + example_file
18
+ err.must_be_empty
19
+ rows = out.scan(/(\d+) rows/).flatten
20
+ rows.length.must_equal 5
21
+ row_lengths = rows.group_by { |elem| elem.to_i }
22
+ row_lengths.length.must_equal 1
23
+ # HaD: 40 pages of 7 articles
24
+ row_lengths.keys.first.must_equal(7 * 40)
25
+ out.must_match(/Done processing/)
26
+ end
27
+ Dir.chdir(curwd)
28
+ end
29
+ end
@@ -0,0 +1,134 @@
1
+ require_relative '../../test_helper'
2
+ describe DocParser::Document do
3
+ before do
4
+ Log4r::Logger['docparser'].level = Log4r::INFO
5
+ $output = DocParser::NilOutput.new
6
+ @parser = Class.new do
7
+ define_method(:outputs) { [$output] }
8
+ end.new
9
+ @test_doc_path = File.join($SUPPORT_DIR, 'test_html.html')
10
+ @test_doc = DocParser::Document.new(filename: @test_doc_path,
11
+ parser: @parser)
12
+ end
13
+
14
+ it 'should read HTML contents' do
15
+ file = File.join($SUPPORT_DIR, 'test_html.html')
16
+ doc = DocParser::Document.new(filename: file, parser: @parser)
17
+ doc.doc.must_be_instance_of Nokogiri::HTML::Document
18
+ doc.html.must_equal(open(file).read)
19
+ end
20
+
21
+ it 'should read XML contents' do
22
+ file = File.join($SUPPORT_DIR, 'test_xml.xml')
23
+ doc = DocParser::Document.new(filename: file, parser: @parser)
24
+ doc.doc.must_be_instance_of Nokogiri::XML::Document
25
+ doc.html.must_equal(open(file).read)
26
+ doc.xpath_content('xmltest > title').must_equal('Test XML')
27
+ doc.xpath_content('xmltest > test').must_equal('Character Data')
28
+ end
29
+
30
+ it 'should read remote contents' do
31
+ url = 'https://gist.github.com/jurriaan/3f2750aa546e3e6719cf/raw'
32
+ doc = DocParser::Document.new(filename: url, parser: @parser)
33
+ doc.html.must_equal(open(url).read)
34
+ end
35
+
36
+ it 'should use the correct encoding' do
37
+ file = File.join($SUPPORT_DIR, 'test_encoding.html')
38
+ file2 = File.join($SUPPORT_DIR, 'test_encoding2.html')
39
+ doc = DocParser::Document.new(filename: file, parser: @parser)
40
+ doc2 = DocParser::Document.new(filename: file2,
41
+ parser: @parser,
42
+ encoding: 'iso-8859-1')
43
+ doc.html.must_equal(doc2.html)
44
+ doc.css_content('#encoding').must_equal(doc2.css_content('#encoding'))
45
+ end
46
+
47
+ it 'should specify filename and encoding in #inspect' do
48
+ @test_doc.inspect.must_include(@test_doc.filename)
49
+ @test_doc.inspect.must_include(@test_doc.encoding)
50
+ end
51
+
52
+ it 'should get the title of a document' do
53
+ @test_doc.title.must_equal('Test HTML')
54
+ end
55
+
56
+ it 'should store the path to the document' do
57
+ @test_doc.filename.must_equal(@test_doc_path)
58
+ end
59
+
60
+ it 'should be possible to use css queries' do
61
+ css = 'article > h1 + p'
62
+ css_content = @test_doc.css_content(css)
63
+ css_element = @test_doc.css(css)
64
+ css_content.must_equal('Great article it is')
65
+ css_content.must_equal(css_element.first.content)
66
+ end
67
+
68
+ it 'should be possible to use xpath queries' do
69
+ xpath = '//li/ancestor::article/h1'
70
+ xpath_content = @test_doc.xpath_content(xpath)
71
+ xpath_element = @test_doc.xpath(xpath)
72
+ xpath_content.must_equal('This is an article')
73
+ xpath_content.must_equal(xpath_element.first.content)
74
+ end
75
+
76
+ it 'should be possible to use regular expressions' do
77
+ regex = @test_doc.regexp(/\<h1\>([^\<])*/)
78
+ regex.must_equal(@test_doc.html.match(/\<h1\>([^\<])*/))
79
+ end
80
+
81
+ it 'should be possible to use blocks on query methods' do
82
+ array = []
83
+ @test_doc.css('p') do |element|
84
+ array << element.content
85
+ end
86
+ array.last.must_equal('This is the last paragraph')
87
+ array2 = []
88
+ @test_doc.xpath('//p') do |element|
89
+ array2 << element.content
90
+ end
91
+ array2.must_equal(array)
92
+ end
93
+
94
+ it 'should warn when providing an empty file' do
95
+ file = Tempfile.new('empty')
96
+ file.write('')
97
+ file.close
98
+
99
+ open(file.path).read.empty?.must_equal true
100
+ _, err = capture_io do
101
+ # Switch to hijacked IO
102
+ Log4r::Outputter['docparser'].instance_variable_set(:@out, $stderr)
103
+ DocParser::Document.new(filename: file.path, parser: @parser)
104
+ end
105
+ # Restore IO
106
+ Log4r::Outputter['docparser'].instance_variable_set(:@out, $stderr)
107
+ err.must_include "#{file.path} is empty"
108
+ end
109
+
110
+ it 'should add the row to the results' do
111
+ @test_doc.add_row ['test']
112
+ @test_doc.add_row 'test', 'test2'
113
+ @test_doc.results.must_equal [[['test'], ['test', 'test2']]]
114
+ end
115
+
116
+ it 'should be possible to specify outputs directly' do
117
+ @test_doc.add_row ['test!'], output: $output
118
+ @test_doc.results.must_equal [[['test!']]]
119
+ end
120
+
121
+ it 'should be possible to use multiple outputs' do
122
+ output = DocParser::NilOutput.new
123
+ output2 = DocParser::NilOutput.new
124
+ parser = Class.new do
125
+ define_method(:outputs) { [output, output2] }
126
+ end.new
127
+ test_doc = DocParser::Document.new(filename: @test_doc_path,
128
+ parser: parser)
129
+ test_doc.add_row ['a'], output: 1
130
+ test_doc.add_row ['b'], output: 0
131
+ test_doc.results.must_equal [[['b']], [['a']]]
132
+ end
133
+
134
+ end
@@ -0,0 +1,19 @@
1
+ require_relative '../../test_helper'
2
+
3
+ describe DocParser do
4
+ it 'should have a valid logger' do
5
+ logger = Log4r::Logger['docparser']
6
+ logger.wont_be_nil
7
+ logger.must_be_instance_of Log4r::Logger
8
+ end
9
+
10
+ it 'must have the correct loglevel by default' do
11
+ Log4r::Logger['docparser'].level.must_equal Log4r::INFO
12
+ end
13
+
14
+ it 'should log to the correct output' do
15
+ outputters = Log4r::Logger['docparser'].outputters
16
+ outputters.length.must_equal 1
17
+ outputters.first.must_be_instance_of Log4r::StdoutOutputter
18
+ end
19
+ end