docparser 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +5 -5
  2. data/.rubocop.yml +15 -3
  3. data/.rubocop_todo.yml +45 -0
  4. data/.travis.yml +1 -1
  5. data/Gemfile +5 -4
  6. data/README.md +2 -2
  7. data/Rakefile +3 -1
  8. data/docparser.gemspec +9 -9
  9. data/example.rb +2 -0
  10. data/lib/docparser.rb +2 -0
  11. data/lib/docparser/document.rb +20 -10
  12. data/lib/docparser/output.rb +9 -6
  13. data/lib/docparser/output/csv_output.rb +2 -0
  14. data/lib/docparser/output/html_output.rb +52 -49
  15. data/lib/docparser/output/json_output.rb +3 -1
  16. data/lib/docparser/output/multi_output.rb +3 -1
  17. data/lib/docparser/output/nil_output.rb +5 -6
  18. data/lib/docparser/output/xlsx_output.rb +2 -0
  19. data/lib/docparser/output/yaml_output.rb +4 -1
  20. data/lib/docparser/parser.rb +9 -13
  21. data/lib/docparser/version.rb +3 -1
  22. data/test/.rubocop.yml +6 -2
  23. data/test/.rubocop_todo.yml +23 -0
  24. data/test/lib/docparser/blackbox_test.rb +5 -4
  25. data/test/lib/docparser/document_test.rb +19 -14
  26. data/test/lib/docparser/output/csv_output_test.rb +5 -10
  27. data/test/lib/docparser/output/html_output_test.rb +5 -10
  28. data/test/lib/docparser/output/json_output_test.rb +8 -13
  29. data/test/lib/docparser/output/multi_output_test.rb +6 -12
  30. data/test/lib/docparser/output/nil_output_test.rb +4 -9
  31. data/test/lib/docparser/output/xlsx_output_test.rb +5 -10
  32. data/test/lib/docparser/output/yaml_output_test.rb +22 -27
  33. data/test/lib/docparser/output_test.rb +3 -8
  34. data/test/lib/docparser/parser_test.rb +2 -22
  35. data/test/lib/docparser/version_test.rb +2 -0
  36. data/test/support/hackaday/dl.rb +2 -0
  37. data/test/test_helper.rb +2 -3
  38. metadata +20 -35
  39. data/test/lib/docparser/logging_test.rb +0 -19
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 69310d210a064254c5c767509118e5f9d4a1bc83
4
- data.tar.gz: 2e39c9e8c9b8833c17277847fc89b460173de70c
2
+ SHA256:
3
+ metadata.gz: 3a857af8eb8403f2f26f867f02e13319fcb173c6f177947986d5fe81e7248a74
4
+ data.tar.gz: 53e66974c57f2662606f1125a0466ca0ce7db476cae0516da71837b3703263e4
5
5
  SHA512:
6
- metadata.gz: 464c4e463de25d0b961476253f54cf13718a2129bab265382d5a3bc34d90a5ecb9d24eead1718688a485b5c9405cec0f6316b7cd2e7741446f3a253341e2cdb2
7
- data.tar.gz: 682813ef09039f64e4285c4db5986f7c2be6387e7df1f608e6294804e583a3d7c11c5427901d093cacdab03f8afcd9f6f0a39ab77f90df395ac7993e77cc28db
6
+ metadata.gz: 70c5721acae9866e862a6747a6749c894bb38dd20792545ad372b8f5265f9a4ca2f5a895e30fe36fc17eb2e511ffe86ec837974d2430ee1864b2b6773ce08b6c
7
+ data.tar.gz: 0b7cd82a97ad79b78bebab10319403cb6cd9cbee27137912315ede6ccbaa432f87d9a9267123724021f119c9f3d67de73dd57350455d3c00bd63132047b1f57c
@@ -1,6 +1,18 @@
1
+ inherit_from: .rubocop_todo.yml
2
+
1
3
  # Temporarily turn this off
2
4
  # Avoid parameter lists longer than three or four parameters.
3
- ParameterLists:
5
+ Metrics/ParameterLists:
4
6
  Enabled: false
5
- MultilineBlockChain:
6
- Enabled: false
7
+ Style/MultilineBlockChain:
8
+ Enabled: false
9
+ Style/HashTransformKeys:
10
+ Enabled: true
11
+ Style/HashTransformValues:
12
+ Enabled: true
13
+ Style/HashEachMethods:
14
+ Enabled: true
15
+ Lint/RaiseException:
16
+ Enabled: true
17
+ Lint/StructNewOverride:
18
+ Enabled: true
@@ -0,0 +1,45 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2020-04-13 17:55:59 +0200 using RuboCop version 0.81.0.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 9
10
+ # Configuration parameters: CountComments, ExcludedMethods.
11
+ # ExcludedMethods: refine
12
+ Metrics/BlockLength:
13
+ Max: 173
14
+
15
+ # Offense count: 2
16
+ # Configuration parameters: ForbiddenDelimiters.
17
+ # ForbiddenDelimiters: (?-mix:(^|\s)(EO[A-Z]{1}|END)(\s|$))
18
+ Naming/HeredocDelimiterNaming:
19
+ Exclude:
20
+ - 'lib/docparser/output/html_output.rb'
21
+
22
+ # Offense count: 16
23
+ Security/Open:
24
+ Exclude:
25
+ - 'lib/docparser/document.rb'
26
+ - 'lib/docparser/output.rb'
27
+ - 'test/lib/docparser/document_test.rb'
28
+ - 'test/lib/docparser/output/csv_output_test.rb'
29
+ - 'test/lib/docparser/output/html_output_test.rb'
30
+ - 'test/lib/docparser/output/json_output_test.rb'
31
+ - 'test/lib/docparser/output/multi_output_test.rb'
32
+ - 'test/lib/docparser/output/yaml_output_test.rb'
33
+
34
+ # Offense count: 4
35
+ # Configuration parameters: EnforcedStyle.
36
+ # SupportedStyles: annotated, template, unannotated
37
+ Style/FormatStringToken:
38
+ Exclude:
39
+ - 'lib/docparser/output.rb'
40
+ - 'lib/docparser/parser.rb'
41
+
42
+ # Offense count: 1
43
+ Style/MixinUsage:
44
+ Exclude:
45
+ - 'example.rb'
@@ -1,7 +1,7 @@
1
1
  language: ruby
2
2
  rvm:
3
3
  - 2.0.0
4
- - 2.1.1
4
+ - 2.1.2
5
5
  - rbx-2.2.5
6
6
  - ruby-head
7
7
  - jruby-head
data/Gemfile CHANGED
@@ -1,12 +1,13 @@
1
+ # frozen_string_literal: true
2
+
1
3
  gemspec
2
4
 
3
5
  source 'https://rubygems.org'
4
6
 
5
7
  group :test do
6
- gem 'minitest', '~> 5.4.1'
7
- gem 'coveralls', require: false
8
+ gem 'minitest', '~> 5.14.0'
8
9
  gem 'rake'
9
- gem 'rubocop', '~> 0.26.0'
10
- gem 'simplecov', require: false
10
+ gem 'rubocop', '~> 0.81.0'
11
11
  gem 'simple_mock'
12
+ gem 'simplecov', require: false
12
13
  end
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # DocParser
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/docparser.png)](http://badge.fury.io/rb/docparser) [![Build Status](https://travis-ci.org/jurriaan/docparser.png?branch=master)](https://travis-ci.org/jurriaan/docparser) [![Dependency Status](https://gemnasium.com/jurriaan/docparser.png)](https://gemnasium.com/jurriaan/docparser) [![Coverage Status](https://coveralls.io/repos/jurriaan/docparser/badge.png?branch=master)](https://coveralls.io/r/jurriaan/docparser)
3
+ [![Gem Version](http://img.shields.io/gem/v/docparser.svg)](http://badge.fury.io/rb/docparser) [![Build Status](http://img.shields.io/travis/jurriaan/docparser.svg)](https://travis-ci.org/jurriaan/docparser) [![Dependency Status](http://img.shields.io/gemnasium/jurriaan/docparser.svg)](https://gemnasium.com/jurriaan/docparser)
4
4
 
5
5
 
6
6
  DocParser is a web scraping/screen scraping tool.
@@ -59,4 +59,4 @@ See [example.rb](https://github.com/jurriaan/docparser/blob/master/example.rb)
59
59
 
60
60
  ## Thanks
61
61
 
62
- - [randym](https://github.com/randym) - for providing the [axlsx](https://github.com/randym/axlsx) gem
62
+ - [randym](https://github.com/randym) - for providing the [axlsx](https://github.com/randym/axlsx) gem
data/Rakefile CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'bundler/gem_tasks'
2
4
  require 'rake/testtask'
3
5
  require 'rubocop'
@@ -16,7 +18,7 @@ task :rubocop do
16
18
  puts "Running Rubocop #{RuboCop::Version::STRING}"
17
19
  args = FileList['**/*.rb', 'Rakefile', 'docparser.gemspec', 'Gemfile']
18
20
  cli = RuboCop::CLI.new
19
- fail unless cli.run(args) == 0
21
+ raise unless cli.run(args).zero?
20
22
  end
21
23
 
22
24
  task default: :test
@@ -1,4 +1,6 @@
1
- lib = File.expand_path('../lib', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path('lib', __dir__)
2
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
5
  require 'docparser/version'
4
6
 
@@ -14,18 +16,16 @@ Gem::Specification.new do |spec|
14
16
  spec.platform = Gem::Platform::RUBY
15
17
 
16
18
  spec.files = `git ls-files`.split($RS)
17
- spec.executables = spec.files.grep(/^bin\//) { |f| File.basename(f) }
18
- spec.test_files = spec.files.grep(/^(test|spec|features)\//)
19
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
20
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
21
  spec.require_paths = ['lib']
20
22
  spec.extra_rdoc_files = ['README.md', 'LICENSE']
21
23
 
22
- spec.add_runtime_dependency 'nokogiri', '~> 1.6.1'
23
- spec.add_runtime_dependency 'parallel', '~> 1.3.2'
24
24
  spec.add_runtime_dependency 'axlsx', '~> 2.0.1'
25
- spec.add_runtime_dependency 'log4r', '~> 1.1.10'
25
+ spec.add_runtime_dependency 'nokogiri', '~> 1.10.0'
26
+ spec.add_runtime_dependency 'parallel', '~> 1.10'
26
27
 
27
- spec.add_development_dependency 'yard'
28
- spec.add_development_dependency 'kramdown', '~> 1.4.1'
29
28
  spec.add_development_dependency 'github-markup'
30
- spec.required_ruby_version = '>= 2.0.0'
29
+ spec.add_development_dependency 'kramdown', '~> 2.1.0'
30
+ spec.add_development_dependency 'yard'
31
31
  end
data/example.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  #
2
4
  # An example of parsing hackaday.com
3
5
  # (C) 2013 Jurriaan Pruis
@@ -1 +1,3 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'docparser/parser'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
  module DocParser
3
5
  # The Document class loads and parses the files.
@@ -19,9 +21,10 @@ module DocParser
19
21
  # @return [String] the source of the document
20
22
  attr_reader :html
21
23
 
22
- def initialize(filename: nil, encoding: 'utf-8', parser: nil)
23
- @logger = Log4r::Logger.new('docparser::document')
24
- @logger.debug { "Parsing #{filename}" }
24
+ def initialize(filename: nil, encoding: 'utf-8', parser: nil, logger: nil)
25
+ @logger = logger || Logger.new(STDERR)
26
+ @logger.level = Logger::INFO
27
+ @logger.debug("Parsing #{filename}")
25
28
  @encoding = encoding
26
29
  @parser = parser
27
30
  @filename = filename
@@ -32,7 +35,7 @@ module DocParser
32
35
  # Adds a row to an output
33
36
  def add_row(*row, output: 0)
34
37
  output = @parser.outputs.index(output) if output.is_a? Output
35
- @logger.debug { "#{filename}: Adding row #{row.flatten}" }
38
+ @logger.debug("#{filename}: Adding row #{row.flatten}")
36
39
  results[output] << row.flatten
37
40
  end
38
41
 
@@ -42,9 +45,14 @@ module DocParser
42
45
  @title ||= xpath_content('//head/title')
43
46
  end
44
47
 
45
- # Executes a xpath query
46
- def xpath(query)
47
- res = @doc.search(query)
48
+ # Executes a xpath/css query
49
+ def elements(query)
50
+ @doc.search(query)
51
+ end
52
+
53
+ def each_element(query)
54
+ res = elements(query)
55
+
48
56
  if block_given?
49
57
  res.each { |el| yield el }
50
58
  else
@@ -54,7 +62,7 @@ module DocParser
54
62
 
55
63
  # Executes a xpath query and returns the content
56
64
  # @return [String] the content of the HTML node
57
- def xpath_content(query)
65
+ def element_content(query)
58
66
  first = @doc.search(query).first
59
67
  if first.nil?
60
68
  nil
@@ -91,7 +99,9 @@ module DocParser
91
99
  end
92
100
  end
93
101
 
94
- alias_method :css, :xpath
95
- alias_method :css_content, :xpath_content
102
+ alias css each_element
103
+ alias xpath each_element
104
+ alias css_content element_content
105
+ alias xpath_content element_content
96
106
  end
97
107
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module DocParser
2
4
  # The Output base class.
3
5
  # All Output classes inherit from this one.
@@ -24,10 +26,11 @@ module DocParser
24
26
  @filename = filename
25
27
  @uniq = uniq
26
28
  @uniqarr = []
27
- fail ArgumentError, 'Please specify a filename' if filename.empty?
29
+ raise ArgumentError, 'Please specify a filename' if filename.empty?
30
+
28
31
  @file = open filename, 'w'
29
- classname = self.class.name.split('::').last
30
- @logger = Log4r::Logger.new("docparser::output::#{classname}")
32
+ @logger = Logger.new(STDERR)
33
+ @logger.level = Logger::INFO
31
34
  open_file
32
35
  end
33
36
 
@@ -40,6 +43,7 @@ module DocParser
40
43
  # Adds a row
41
44
  def add_row(row)
42
45
  return if @uniq && @uniqarr.include?(row.hash)
46
+
43
47
  @rowcount += 1
44
48
  write_row row
45
49
  @uniqarr << row.hash
@@ -66,12 +70,11 @@ module DocParser
66
70
 
67
71
  # Called when a row is added
68
72
  def write_row(_row)
69
- fail NotImplementedError, 'No row writer defined'
73
+ raise NotImplementedError, 'No row writer defined'
70
74
  end
71
75
 
72
76
  # Called before closing the file
73
- def footer
74
- end
77
+ def footer; end
75
78
  end
76
79
 
77
80
  # MissingHeaderException gets thrown if a required header is missing.
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'csv'
2
4
  module DocParser
3
5
  # The CSVOutput class generates a CSV file containing all rows
@@ -1,65 +1,68 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'cgi'
2
4
  module DocParser
3
5
  # The HTMLOutput class generates an HTML file containing a table
4
6
  # @see Output
5
7
  class HTMLOutput < Output
6
8
  # @!visibility private
7
- HTMLHEADER = <<-EOS
8
- <!DOCTYPE html>
9
- <html>
10
- <head>
11
- <title>HTML output "#FILENAME#"</title>
12
- <meta charset="utf-8">
13
- <style type="text/css">
14
- body {
15
- font-family:"Helvetica Neue", Helvetica, Arial, sans-serif;
16
- font-size:12px;
17
- }
18
- table {
19
- border:1px solid #69c;
20
- border-collapse:collapse;
21
- font-size:12px;
22
- text-align:left;
23
- width:480px;
24
- }
25
- th {
26
- border-bottom:1px dashed #69c;
27
- color:#039;
28
- font-size:14px;
29
- font-weight:normal;
30
- padding:12px 17px;
31
- }
32
- td {
33
- color:#669;
34
- padding:7px 17px;
35
- white-space: pre;
36
- }
37
- tbody tr:hover td {
38
- background:#d0dafd;
39
- color:#339;
40
- }
41
- tbody tr:nth-child(even) {
42
- background:#e0eaff;
43
- }
44
- </style>
45
- </head>
46
- <body>
47
- <table>
48
- EOS
9
+ HTMLHEADER = <<~EOS
10
+ <!DOCTYPE html>
11
+ <html>
12
+ <head>
13
+ <title>HTML output "#FILENAME#"</title>
14
+ <meta charset="utf-8">
15
+ <style type="text/css">
16
+ body {
17
+ font-family:"Helvetica Neue", Helvetica, Arial, sans-serif;
18
+ font-size:12px;
19
+ }
20
+ table {
21
+ border:1px solid #69c;
22
+ border-collapse:collapse;
23
+ font-size:12px;
24
+ text-align:left;
25
+ width:480px;
26
+ }
27
+ th {
28
+ border-bottom:1px dashed #69c;
29
+ color:#039;
30
+ font-size:14px;
31
+ font-weight:normal;
32
+ padding:12px 17px;
33
+ }
34
+ td {
35
+ color:#669;
36
+ padding:7px 17px;
37
+ white-space: pre;
38
+ }
39
+ tbody tr:hover td {
40
+ background:#d0dafd;
41
+ color:#339;
42
+ }
43
+ tbody tr:nth-child(even) {
44
+ background:#e0eaff;
45
+ }
46
+ </style>
47
+ </head>
48
+ <body>
49
+ <table>
50
+ EOS
49
51
  # @!visibility private
50
- HTMLFOOTER = <<-EOS
51
- </tbody>
52
- </table>
53
- <p>#COUNT# rows</p>
54
- </body>
55
- </html>
56
- EOS
52
+ HTMLFOOTER = <<~EOS
53
+ </tbody>
54
+ </table>
55
+ <p>#COUNT# rows</p>
56
+ </body>
57
+ </html>
58
+ EOS
57
59
  def open_file
58
60
  @file << HTMLHEADER.gsub('#FILENAME#', @filename)
59
61
  end
60
62
 
61
63
  def header
62
64
  return if @header.nil? || @header.empty?
65
+
63
66
  @file << '<thead><tr>'
64
67
  @file << @header.map { |f| '<th>' + f + '</th>' }.join
65
68
  @file << "</tr></thead>\n<tbody>\n"
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'json'
2
4
  module DocParser
3
5
  # The JSONOutput class generates a JSON file containing all rows as separate
@@ -11,7 +13,7 @@ module DocParser
11
13
  end
12
14
 
13
15
  def write_row(row)
14
- fail MissingHeaderException if @header.nil? || @header.length == 0
16
+ raise MissingHeaderException if @header.nil? || @header.empty?
15
17
 
16
18
  @file << ',' unless @file.pos <= 1
17
19
 
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module DocParser
2
4
  # The MultiOutput output combines multiple outputs.
3
5
  # It creates a CSV, HTML, YAML and XLSX Output file
@@ -9,7 +11,7 @@ module DocParser
9
11
  class MultiOutput < Output
10
12
  # All the possible outputs
11
13
  OUTPUT_TYPES = { csv: CSVOutput, html: HTMLOutput, yml: YAMLOutput,
12
- xlsx: XLSXOutput, json: JSONOutput }
14
+ xlsx: XLSXOutput, json: JSONOutput }.freeze
13
15
 
14
16
  # @!visibility private
15
17
  def initialize(**options)