docparser 0.2.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +5 -5
  2. data/.rubocop.yml +15 -3
  3. data/.rubocop_todo.yml +45 -0
  4. data/.travis.yml +1 -1
  5. data/Gemfile +5 -4
  6. data/README.md +2 -2
  7. data/Rakefile +3 -1
  8. data/docparser.gemspec +9 -9
  9. data/example.rb +2 -0
  10. data/lib/docparser.rb +2 -0
  11. data/lib/docparser/document.rb +20 -10
  12. data/lib/docparser/output.rb +9 -6
  13. data/lib/docparser/output/csv_output.rb +2 -0
  14. data/lib/docparser/output/html_output.rb +52 -49
  15. data/lib/docparser/output/json_output.rb +3 -1
  16. data/lib/docparser/output/multi_output.rb +3 -1
  17. data/lib/docparser/output/nil_output.rb +5 -6
  18. data/lib/docparser/output/xlsx_output.rb +2 -0
  19. data/lib/docparser/output/yaml_output.rb +4 -1
  20. data/lib/docparser/parser.rb +9 -13
  21. data/lib/docparser/version.rb +3 -1
  22. data/test/.rubocop.yml +6 -2
  23. data/test/.rubocop_todo.yml +23 -0
  24. data/test/lib/docparser/blackbox_test.rb +5 -4
  25. data/test/lib/docparser/document_test.rb +19 -14
  26. data/test/lib/docparser/output/csv_output_test.rb +5 -10
  27. data/test/lib/docparser/output/html_output_test.rb +5 -10
  28. data/test/lib/docparser/output/json_output_test.rb +8 -13
  29. data/test/lib/docparser/output/multi_output_test.rb +6 -12
  30. data/test/lib/docparser/output/nil_output_test.rb +4 -9
  31. data/test/lib/docparser/output/xlsx_output_test.rb +5 -10
  32. data/test/lib/docparser/output/yaml_output_test.rb +22 -27
  33. data/test/lib/docparser/output_test.rb +3 -8
  34. data/test/lib/docparser/parser_test.rb +2 -22
  35. data/test/lib/docparser/version_test.rb +2 -0
  36. data/test/support/hackaday/dl.rb +2 -0
  37. data/test/test_helper.rb +2 -3
  38. metadata +20 -35
  39. data/test/lib/docparser/logging_test.rb +0 -19
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 69310d210a064254c5c767509118e5f9d4a1bc83
4
- data.tar.gz: 2e39c9e8c9b8833c17277847fc89b460173de70c
2
+ SHA256:
3
+ metadata.gz: 3a857af8eb8403f2f26f867f02e13319fcb173c6f177947986d5fe81e7248a74
4
+ data.tar.gz: 53e66974c57f2662606f1125a0466ca0ce7db476cae0516da71837b3703263e4
5
5
  SHA512:
6
- metadata.gz: 464c4e463de25d0b961476253f54cf13718a2129bab265382d5a3bc34d90a5ecb9d24eead1718688a485b5c9405cec0f6316b7cd2e7741446f3a253341e2cdb2
7
- data.tar.gz: 682813ef09039f64e4285c4db5986f7c2be6387e7df1f608e6294804e583a3d7c11c5427901d093cacdab03f8afcd9f6f0a39ab77f90df395ac7993e77cc28db
6
+ metadata.gz: 70c5721acae9866e862a6747a6749c894bb38dd20792545ad372b8f5265f9a4ca2f5a895e30fe36fc17eb2e511ffe86ec837974d2430ee1864b2b6773ce08b6c
7
+ data.tar.gz: 0b7cd82a97ad79b78bebab10319403cb6cd9cbee27137912315ede6ccbaa432f87d9a9267123724021f119c9f3d67de73dd57350455d3c00bd63132047b1f57c
@@ -1,6 +1,18 @@
1
+ inherit_from: .rubocop_todo.yml
2
+
1
3
  # Temporarily turn this off
2
4
  # Avoid parameter lists longer than three or four parameters.
3
- ParameterLists:
5
+ Metrics/ParameterLists:
4
6
  Enabled: false
5
- MultilineBlockChain:
6
- Enabled: false
7
+ Style/MultilineBlockChain:
8
+ Enabled: false
9
+ Style/HashTransformKeys:
10
+ Enabled: true
11
+ Style/HashTransformValues:
12
+ Enabled: true
13
+ Style/HashEachMethods:
14
+ Enabled: true
15
+ Lint/RaiseException:
16
+ Enabled: true
17
+ Lint/StructNewOverride:
18
+ Enabled: true
@@ -0,0 +1,45 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2020-04-13 17:55:59 +0200 using RuboCop version 0.81.0.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 9
10
+ # Configuration parameters: CountComments, ExcludedMethods.
11
+ # ExcludedMethods: refine
12
+ Metrics/BlockLength:
13
+ Max: 173
14
+
15
+ # Offense count: 2
16
+ # Configuration parameters: ForbiddenDelimiters.
17
+ # ForbiddenDelimiters: (?-mix:(^|\s)(EO[A-Z]{1}|END)(\s|$))
18
+ Naming/HeredocDelimiterNaming:
19
+ Exclude:
20
+ - 'lib/docparser/output/html_output.rb'
21
+
22
+ # Offense count: 16
23
+ Security/Open:
24
+ Exclude:
25
+ - 'lib/docparser/document.rb'
26
+ - 'lib/docparser/output.rb'
27
+ - 'test/lib/docparser/document_test.rb'
28
+ - 'test/lib/docparser/output/csv_output_test.rb'
29
+ - 'test/lib/docparser/output/html_output_test.rb'
30
+ - 'test/lib/docparser/output/json_output_test.rb'
31
+ - 'test/lib/docparser/output/multi_output_test.rb'
32
+ - 'test/lib/docparser/output/yaml_output_test.rb'
33
+
34
+ # Offense count: 4
35
+ # Configuration parameters: EnforcedStyle.
36
+ # SupportedStyles: annotated, template, unannotated
37
+ Style/FormatStringToken:
38
+ Exclude:
39
+ - 'lib/docparser/output.rb'
40
+ - 'lib/docparser/parser.rb'
41
+
42
+ # Offense count: 1
43
+ Style/MixinUsage:
44
+ Exclude:
45
+ - 'example.rb'
@@ -1,7 +1,7 @@
1
1
  language: ruby
2
2
  rvm:
3
3
  - 2.0.0
4
- - 2.1.1
4
+ - 2.1.2
5
5
  - rbx-2.2.5
6
6
  - ruby-head
7
7
  - jruby-head
data/Gemfile CHANGED
@@ -1,12 +1,13 @@
1
+ # frozen_string_literal: true
2
+
1
3
  gemspec
2
4
 
3
5
  source 'https://rubygems.org'
4
6
 
5
7
  group :test do
6
- gem 'minitest', '~> 5.4.1'
7
- gem 'coveralls', require: false
8
+ gem 'minitest', '~> 5.14.0'
8
9
  gem 'rake'
9
- gem 'rubocop', '~> 0.26.0'
10
- gem 'simplecov', require: false
10
+ gem 'rubocop', '~> 0.81.0'
11
11
  gem 'simple_mock'
12
+ gem 'simplecov', require: false
12
13
  end
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # DocParser
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/docparser.png)](http://badge.fury.io/rb/docparser) [![Build Status](https://travis-ci.org/jurriaan/docparser.png?branch=master)](https://travis-ci.org/jurriaan/docparser) [![Dependency Status](https://gemnasium.com/jurriaan/docparser.png)](https://gemnasium.com/jurriaan/docparser) [![Coverage Status](https://coveralls.io/repos/jurriaan/docparser/badge.png?branch=master)](https://coveralls.io/r/jurriaan/docparser)
3
+ [![Gem Version](http://img.shields.io/gem/v/docparser.svg)](http://badge.fury.io/rb/docparser) [![Build Status](http://img.shields.io/travis/jurriaan/docparser.svg)](https://travis-ci.org/jurriaan/docparser) [![Dependency Status](http://img.shields.io/gemnasium/jurriaan/docparser.svg)](https://gemnasium.com/jurriaan/docparser)
4
4
 
5
5
 
6
6
  DocParser is a web scraping/screen scraping tool.
@@ -59,4 +59,4 @@ See [example.rb](https://github.com/jurriaan/docparser/blob/master/example.rb)
59
59
 
60
60
  ## Thanks
61
61
 
62
- - [randym](https://github.com/randym) - for providing the [axlsx](https://github.com/randym/axlsx) gem
62
+ - [randym](https://github.com/randym) - for providing the [axlsx](https://github.com/randym/axlsx) gem
data/Rakefile CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'bundler/gem_tasks'
2
4
  require 'rake/testtask'
3
5
  require 'rubocop'
@@ -16,7 +18,7 @@ task :rubocop do
16
18
  puts "Running Rubocop #{RuboCop::Version::STRING}"
17
19
  args = FileList['**/*.rb', 'Rakefile', 'docparser.gemspec', 'Gemfile']
18
20
  cli = RuboCop::CLI.new
19
- fail unless cli.run(args) == 0
21
+ raise unless cli.run(args).zero?
20
22
  end
21
23
 
22
24
  task default: :test
@@ -1,4 +1,6 @@
1
- lib = File.expand_path('../lib', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path('lib', __dir__)
2
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
5
  require 'docparser/version'
4
6
 
@@ -14,18 +16,16 @@ Gem::Specification.new do |spec|
14
16
  spec.platform = Gem::Platform::RUBY
15
17
 
16
18
  spec.files = `git ls-files`.split($RS)
17
- spec.executables = spec.files.grep(/^bin\//) { |f| File.basename(f) }
18
- spec.test_files = spec.files.grep(/^(test|spec|features)\//)
19
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
20
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
21
  spec.require_paths = ['lib']
20
22
  spec.extra_rdoc_files = ['README.md', 'LICENSE']
21
23
 
22
- spec.add_runtime_dependency 'nokogiri', '~> 1.6.1'
23
- spec.add_runtime_dependency 'parallel', '~> 1.3.2'
24
24
  spec.add_runtime_dependency 'axlsx', '~> 2.0.1'
25
- spec.add_runtime_dependency 'log4r', '~> 1.1.10'
25
+ spec.add_runtime_dependency 'nokogiri', '~> 1.10.0'
26
+ spec.add_runtime_dependency 'parallel', '~> 1.10'
26
27
 
27
- spec.add_development_dependency 'yard'
28
- spec.add_development_dependency 'kramdown', '~> 1.4.1'
29
28
  spec.add_development_dependency 'github-markup'
30
- spec.required_ruby_version = '>= 2.0.0'
29
+ spec.add_development_dependency 'kramdown', '~> 2.1.0'
30
+ spec.add_development_dependency 'yard'
31
31
  end
data/example.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  #
2
4
  # An example of parsing hackaday.com
3
5
  # (C) 2013 Jurriaan Pruis
@@ -1 +1,3 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'docparser/parser'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
  module DocParser
3
5
  # The Document class loads and parses the files.
@@ -19,9 +21,10 @@ module DocParser
19
21
  # @return [String] the source of the document
20
22
  attr_reader :html
21
23
 
22
- def initialize(filename: nil, encoding: 'utf-8', parser: nil)
23
- @logger = Log4r::Logger.new('docparser::document')
24
- @logger.debug { "Parsing #{filename}" }
24
+ def initialize(filename: nil, encoding: 'utf-8', parser: nil, logger: nil)
25
+ @logger = logger || Logger.new(STDERR)
26
+ @logger.level = Logger::INFO
27
+ @logger.debug("Parsing #{filename}")
25
28
  @encoding = encoding
26
29
  @parser = parser
27
30
  @filename = filename
@@ -32,7 +35,7 @@ module DocParser
32
35
  # Adds a row to an output
33
36
  def add_row(*row, output: 0)
34
37
  output = @parser.outputs.index(output) if output.is_a? Output
35
- @logger.debug { "#{filename}: Adding row #{row.flatten}" }
38
+ @logger.debug("#{filename}: Adding row #{row.flatten}")
36
39
  results[output] << row.flatten
37
40
  end
38
41
 
@@ -42,9 +45,14 @@ module DocParser
42
45
  @title ||= xpath_content('//head/title')
43
46
  end
44
47
 
45
- # Executes a xpath query
46
- def xpath(query)
47
- res = @doc.search(query)
48
+ # Executes a xpath/css query
49
+ def elements(query)
50
+ @doc.search(query)
51
+ end
52
+
53
+ def each_element(query)
54
+ res = elements(query)
55
+
48
56
  if block_given?
49
57
  res.each { |el| yield el }
50
58
  else
@@ -54,7 +62,7 @@ module DocParser
54
62
 
55
63
  # Executes a xpath query and returns the content
56
64
  # @return [String] the content of the HTML node
57
- def xpath_content(query)
65
+ def element_content(query)
58
66
  first = @doc.search(query).first
59
67
  if first.nil?
60
68
  nil
@@ -91,7 +99,9 @@ module DocParser
91
99
  end
92
100
  end
93
101
 
94
- alias_method :css, :xpath
95
- alias_method :css_content, :xpath_content
102
+ alias css each_element
103
+ alias xpath each_element
104
+ alias css_content element_content
105
+ alias xpath_content element_content
96
106
  end
97
107
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module DocParser
2
4
  # The Output base class.
3
5
  # All Output classes inherit from this one.
@@ -24,10 +26,11 @@ module DocParser
24
26
  @filename = filename
25
27
  @uniq = uniq
26
28
  @uniqarr = []
27
- fail ArgumentError, 'Please specify a filename' if filename.empty?
29
+ raise ArgumentError, 'Please specify a filename' if filename.empty?
30
+
28
31
  @file = open filename, 'w'
29
- classname = self.class.name.split('::').last
30
- @logger = Log4r::Logger.new("docparser::output::#{classname}")
32
+ @logger = Logger.new(STDERR)
33
+ @logger.level = Logger::INFO
31
34
  open_file
32
35
  end
33
36
 
@@ -40,6 +43,7 @@ module DocParser
40
43
  # Adds a row
41
44
  def add_row(row)
42
45
  return if @uniq && @uniqarr.include?(row.hash)
46
+
43
47
  @rowcount += 1
44
48
  write_row row
45
49
  @uniqarr << row.hash
@@ -66,12 +70,11 @@ module DocParser
66
70
 
67
71
  # Called when a row is added
68
72
  def write_row(_row)
69
- fail NotImplementedError, 'No row writer defined'
73
+ raise NotImplementedError, 'No row writer defined'
70
74
  end
71
75
 
72
76
  # Called before closing the file
73
- def footer
74
- end
77
+ def footer; end
75
78
  end
76
79
 
77
80
  # MissingHeaderException gets thrown if a required header is missing.
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'csv'
2
4
  module DocParser
3
5
  # The CSVOutput class generates a CSV file containing all rows
@@ -1,65 +1,68 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'cgi'
2
4
  module DocParser
3
5
  # The HTMLOutput class generates an HTML file containing a table
4
6
  # @see Output
5
7
  class HTMLOutput < Output
6
8
  # @!visibility private
7
- HTMLHEADER = <<-EOS
8
- <!DOCTYPE html>
9
- <html>
10
- <head>
11
- <title>HTML output "#FILENAME#"</title>
12
- <meta charset="utf-8">
13
- <style type="text/css">
14
- body {
15
- font-family:"Helvetica Neue", Helvetica, Arial, sans-serif;
16
- font-size:12px;
17
- }
18
- table {
19
- border:1px solid #69c;
20
- border-collapse:collapse;
21
- font-size:12px;
22
- text-align:left;
23
- width:480px;
24
- }
25
- th {
26
- border-bottom:1px dashed #69c;
27
- color:#039;
28
- font-size:14px;
29
- font-weight:normal;
30
- padding:12px 17px;
31
- }
32
- td {
33
- color:#669;
34
- padding:7px 17px;
35
- white-space: pre;
36
- }
37
- tbody tr:hover td {
38
- background:#d0dafd;
39
- color:#339;
40
- }
41
- tbody tr:nth-child(even) {
42
- background:#e0eaff;
43
- }
44
- </style>
45
- </head>
46
- <body>
47
- <table>
48
- EOS
9
+ HTMLHEADER = <<~EOS
10
+ <!DOCTYPE html>
11
+ <html>
12
+ <head>
13
+ <title>HTML output "#FILENAME#"</title>
14
+ <meta charset="utf-8">
15
+ <style type="text/css">
16
+ body {
17
+ font-family:"Helvetica Neue", Helvetica, Arial, sans-serif;
18
+ font-size:12px;
19
+ }
20
+ table {
21
+ border:1px solid #69c;
22
+ border-collapse:collapse;
23
+ font-size:12px;
24
+ text-align:left;
25
+ width:480px;
26
+ }
27
+ th {
28
+ border-bottom:1px dashed #69c;
29
+ color:#039;
30
+ font-size:14px;
31
+ font-weight:normal;
32
+ padding:12px 17px;
33
+ }
34
+ td {
35
+ color:#669;
36
+ padding:7px 17px;
37
+ white-space: pre;
38
+ }
39
+ tbody tr:hover td {
40
+ background:#d0dafd;
41
+ color:#339;
42
+ }
43
+ tbody tr:nth-child(even) {
44
+ background:#e0eaff;
45
+ }
46
+ </style>
47
+ </head>
48
+ <body>
49
+ <table>
50
+ EOS
49
51
  # @!visibility private
50
- HTMLFOOTER = <<-EOS
51
- </tbody>
52
- </table>
53
- <p>#COUNT# rows</p>
54
- </body>
55
- </html>
56
- EOS
52
+ HTMLFOOTER = <<~EOS
53
+ </tbody>
54
+ </table>
55
+ <p>#COUNT# rows</p>
56
+ </body>
57
+ </html>
58
+ EOS
57
59
  def open_file
58
60
  @file << HTMLHEADER.gsub('#FILENAME#', @filename)
59
61
  end
60
62
 
61
63
  def header
62
64
  return if @header.nil? || @header.empty?
65
+
63
66
  @file << '<thead><tr>'
64
67
  @file << @header.map { |f| '<th>' + f + '</th>' }.join
65
68
  @file << "</tr></thead>\n<tbody>\n"
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'json'
2
4
  module DocParser
3
5
  # The JSONOutput class generates a JSON file containing all rows as separate
@@ -11,7 +13,7 @@ module DocParser
11
13
  end
12
14
 
13
15
  def write_row(row)
14
- fail MissingHeaderException if @header.nil? || @header.length == 0
16
+ raise MissingHeaderException if @header.nil? || @header.empty?
15
17
 
16
18
  @file << ',' unless @file.pos <= 1
17
19
 
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module DocParser
2
4
  # The MultiOutput output combines multiple outputs.
3
5
  # It creates a CSV, HTML, YAML and XLSX Output file
@@ -9,7 +11,7 @@ module DocParser
9
11
  class MultiOutput < Output
10
12
  # All the possible outputs
11
13
  OUTPUT_TYPES = { csv: CSVOutput, html: HTMLOutput, yml: YAMLOutput,
12
- xlsx: XLSXOutput, json: JSONOutput }
14
+ xlsx: XLSXOutput, json: JSONOutput }.freeze
13
15
 
14
16
  # @!visibility private
15
17
  def initialize(**options)