docparser 0.2.3 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.rubocop.yml +15 -3
- data/.rubocop_todo.yml +45 -0
- data/.travis.yml +1 -1
- data/Gemfile +5 -4
- data/README.md +2 -2
- data/Rakefile +3 -1
- data/docparser.gemspec +9 -9
- data/example.rb +2 -0
- data/lib/docparser.rb +2 -0
- data/lib/docparser/document.rb +20 -10
- data/lib/docparser/output.rb +9 -6
- data/lib/docparser/output/csv_output.rb +2 -0
- data/lib/docparser/output/html_output.rb +52 -49
- data/lib/docparser/output/json_output.rb +3 -1
- data/lib/docparser/output/multi_output.rb +3 -1
- data/lib/docparser/output/nil_output.rb +5 -6
- data/lib/docparser/output/xlsx_output.rb +2 -0
- data/lib/docparser/output/yaml_output.rb +4 -1
- data/lib/docparser/parser.rb +9 -13
- data/lib/docparser/version.rb +3 -1
- data/test/.rubocop.yml +6 -2
- data/test/.rubocop_todo.yml +23 -0
- data/test/lib/docparser/blackbox_test.rb +5 -4
- data/test/lib/docparser/document_test.rb +19 -14
- data/test/lib/docparser/output/csv_output_test.rb +5 -10
- data/test/lib/docparser/output/html_output_test.rb +5 -10
- data/test/lib/docparser/output/json_output_test.rb +8 -13
- data/test/lib/docparser/output/multi_output_test.rb +6 -12
- data/test/lib/docparser/output/nil_output_test.rb +4 -9
- data/test/lib/docparser/output/xlsx_output_test.rb +5 -10
- data/test/lib/docparser/output/yaml_output_test.rb +22 -27
- data/test/lib/docparser/output_test.rb +3 -8
- data/test/lib/docparser/parser_test.rb +2 -22
- data/test/lib/docparser/version_test.rb +2 -0
- data/test/support/hackaday/dl.rb +2 -0
- data/test/test_helper.rb +2 -3
- metadata +20 -35
- data/test/lib/docparser/logging_test.rb +0 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 3a857af8eb8403f2f26f867f02e13319fcb173c6f177947986d5fe81e7248a74
|
4
|
+
data.tar.gz: 53e66974c57f2662606f1125a0466ca0ce7db476cae0516da71837b3703263e4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 70c5721acae9866e862a6747a6749c894bb38dd20792545ad372b8f5265f9a4ca2f5a895e30fe36fc17eb2e511ffe86ec837974d2430ee1864b2b6773ce08b6c
|
7
|
+
data.tar.gz: 0b7cd82a97ad79b78bebab10319403cb6cd9cbee27137912315ede6ccbaa432f87d9a9267123724021f119c9f3d67de73dd57350455d3c00bd63132047b1f57c
|
data/.rubocop.yml
CHANGED
@@ -1,6 +1,18 @@
|
|
1
|
+
inherit_from: .rubocop_todo.yml
|
2
|
+
|
1
3
|
# Temporarily turn this off
|
2
4
|
# Avoid parameter lists longer than three or four parameters.
|
3
|
-
ParameterLists:
|
5
|
+
Metrics/ParameterLists:
|
4
6
|
Enabled: false
|
5
|
-
MultilineBlockChain:
|
6
|
-
Enabled: false
|
7
|
+
Style/MultilineBlockChain:
|
8
|
+
Enabled: false
|
9
|
+
Style/HashTransformKeys:
|
10
|
+
Enabled: true
|
11
|
+
Style/HashTransformValues:
|
12
|
+
Enabled: true
|
13
|
+
Style/HashEachMethods:
|
14
|
+
Enabled: true
|
15
|
+
Lint/RaiseException:
|
16
|
+
Enabled: true
|
17
|
+
Lint/StructNewOverride:
|
18
|
+
Enabled: true
|
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# This configuration was generated by
|
2
|
+
# `rubocop --auto-gen-config`
|
3
|
+
# on 2020-04-13 17:55:59 +0200 using RuboCop version 0.81.0.
|
4
|
+
# The point is for the user to remove these configuration records
|
5
|
+
# one by one as the offenses are removed from the code base.
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
8
|
+
|
9
|
+
# Offense count: 9
|
10
|
+
# Configuration parameters: CountComments, ExcludedMethods.
|
11
|
+
# ExcludedMethods: refine
|
12
|
+
Metrics/BlockLength:
|
13
|
+
Max: 173
|
14
|
+
|
15
|
+
# Offense count: 2
|
16
|
+
# Configuration parameters: ForbiddenDelimiters.
|
17
|
+
# ForbiddenDelimiters: (?-mix:(^|\s)(EO[A-Z]{1}|END)(\s|$))
|
18
|
+
Naming/HeredocDelimiterNaming:
|
19
|
+
Exclude:
|
20
|
+
- 'lib/docparser/output/html_output.rb'
|
21
|
+
|
22
|
+
# Offense count: 16
|
23
|
+
Security/Open:
|
24
|
+
Exclude:
|
25
|
+
- 'lib/docparser/document.rb'
|
26
|
+
- 'lib/docparser/output.rb'
|
27
|
+
- 'test/lib/docparser/document_test.rb'
|
28
|
+
- 'test/lib/docparser/output/csv_output_test.rb'
|
29
|
+
- 'test/lib/docparser/output/html_output_test.rb'
|
30
|
+
- 'test/lib/docparser/output/json_output_test.rb'
|
31
|
+
- 'test/lib/docparser/output/multi_output_test.rb'
|
32
|
+
- 'test/lib/docparser/output/yaml_output_test.rb'
|
33
|
+
|
34
|
+
# Offense count: 4
|
35
|
+
# Configuration parameters: EnforcedStyle.
|
36
|
+
# SupportedStyles: annotated, template, unannotated
|
37
|
+
Style/FormatStringToken:
|
38
|
+
Exclude:
|
39
|
+
- 'lib/docparser/output.rb'
|
40
|
+
- 'lib/docparser/parser.rb'
|
41
|
+
|
42
|
+
# Offense count: 1
|
43
|
+
Style/MixinUsage:
|
44
|
+
Exclude:
|
45
|
+
- 'example.rb'
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
@@ -1,12 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
gemspec
|
2
4
|
|
3
5
|
source 'https://rubygems.org'
|
4
6
|
|
5
7
|
group :test do
|
6
|
-
gem 'minitest', '~> 5.
|
7
|
-
gem 'coveralls', require: false
|
8
|
+
gem 'minitest', '~> 5.14.0'
|
8
9
|
gem 'rake'
|
9
|
-
gem 'rubocop', '~> 0.
|
10
|
-
gem 'simplecov', require: false
|
10
|
+
gem 'rubocop', '~> 0.81.0'
|
11
11
|
gem 'simple_mock'
|
12
|
+
gem 'simplecov', require: false
|
12
13
|
end
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# DocParser
|
2
2
|
|
3
|
-
[![Gem Version](
|
3
|
+
[![Gem Version](http://img.shields.io/gem/v/docparser.svg)](http://badge.fury.io/rb/docparser) [![Build Status](http://img.shields.io/travis/jurriaan/docparser.svg)](https://travis-ci.org/jurriaan/docparser) [![Dependency Status](http://img.shields.io/gemnasium/jurriaan/docparser.svg)](https://gemnasium.com/jurriaan/docparser)
|
4
4
|
|
5
5
|
|
6
6
|
DocParser is a web scraping/screen scraping tool.
|
@@ -59,4 +59,4 @@ See [example.rb](https://github.com/jurriaan/docparser/blob/master/example.rb)
|
|
59
59
|
|
60
60
|
## Thanks
|
61
61
|
|
62
|
-
- [randym](https://github.com/randym) - for providing the [axlsx](https://github.com/randym/axlsx) gem
|
62
|
+
- [randym](https://github.com/randym) - for providing the [axlsx](https://github.com/randym/axlsx) gem
|
data/Rakefile
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'bundler/gem_tasks'
|
2
4
|
require 'rake/testtask'
|
3
5
|
require 'rubocop'
|
@@ -16,7 +18,7 @@ task :rubocop do
|
|
16
18
|
puts "Running Rubocop #{RuboCop::Version::STRING}"
|
17
19
|
args = FileList['**/*.rb', 'Rakefile', 'docparser.gemspec', 'Gemfile']
|
18
20
|
cli = RuboCop::CLI.new
|
19
|
-
|
21
|
+
raise unless cli.run(args).zero?
|
20
22
|
end
|
21
23
|
|
22
24
|
task default: :test
|
data/docparser.gemspec
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
2
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
5
|
require 'docparser/version'
|
4
6
|
|
@@ -14,18 +16,16 @@ Gem::Specification.new do |spec|
|
|
14
16
|
spec.platform = Gem::Platform::RUBY
|
15
17
|
|
16
18
|
spec.files = `git ls-files`.split($RS)
|
17
|
-
spec.executables = spec.files.grep(
|
18
|
-
spec.test_files = spec.files.grep(
|
19
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
20
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
21
|
spec.require_paths = ['lib']
|
20
22
|
spec.extra_rdoc_files = ['README.md', 'LICENSE']
|
21
23
|
|
22
|
-
spec.add_runtime_dependency 'nokogiri', '~> 1.6.1'
|
23
|
-
spec.add_runtime_dependency 'parallel', '~> 1.3.2'
|
24
24
|
spec.add_runtime_dependency 'axlsx', '~> 2.0.1'
|
25
|
-
spec.add_runtime_dependency '
|
25
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.10.0'
|
26
|
+
spec.add_runtime_dependency 'parallel', '~> 1.10'
|
26
27
|
|
27
|
-
spec.add_development_dependency 'yard'
|
28
|
-
spec.add_development_dependency 'kramdown', '~> 1.4.1'
|
29
28
|
spec.add_development_dependency 'github-markup'
|
30
|
-
spec.
|
29
|
+
spec.add_development_dependency 'kramdown', '~> 2.1.0'
|
30
|
+
spec.add_development_dependency 'yard'
|
31
31
|
end
|
data/example.rb
CHANGED
data/lib/docparser.rb
CHANGED
data/lib/docparser/document.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
module DocParser
|
3
5
|
# The Document class loads and parses the files.
|
@@ -19,9 +21,10 @@ module DocParser
|
|
19
21
|
# @return [String] the source of the document
|
20
22
|
attr_reader :html
|
21
23
|
|
22
|
-
def initialize(filename: nil, encoding: 'utf-8', parser: nil)
|
23
|
-
@logger =
|
24
|
-
@logger.
|
24
|
+
def initialize(filename: nil, encoding: 'utf-8', parser: nil, logger: nil)
|
25
|
+
@logger = logger || Logger.new(STDERR)
|
26
|
+
@logger.level = Logger::INFO
|
27
|
+
@logger.debug("Parsing #{filename}")
|
25
28
|
@encoding = encoding
|
26
29
|
@parser = parser
|
27
30
|
@filename = filename
|
@@ -32,7 +35,7 @@ module DocParser
|
|
32
35
|
# Adds a row to an output
|
33
36
|
def add_row(*row, output: 0)
|
34
37
|
output = @parser.outputs.index(output) if output.is_a? Output
|
35
|
-
@logger.debug
|
38
|
+
@logger.debug("#{filename}: Adding row #{row.flatten}")
|
36
39
|
results[output] << row.flatten
|
37
40
|
end
|
38
41
|
|
@@ -42,9 +45,14 @@ module DocParser
|
|
42
45
|
@title ||= xpath_content('//head/title')
|
43
46
|
end
|
44
47
|
|
45
|
-
# Executes a xpath query
|
46
|
-
def
|
47
|
-
|
48
|
+
# Executes an XPath/CSS query
|
49
|
+
def elements(query)
|
50
|
+
@doc.search(query)
|
51
|
+
end
|
52
|
+
|
53
|
+
def each_element(query)
|
54
|
+
res = elements(query)
|
55
|
+
|
48
56
|
if block_given?
|
49
57
|
res.each { |el| yield el }
|
50
58
|
else
|
@@ -54,7 +62,7 @@ module DocParser
|
|
54
62
|
|
55
63
|
# Executes an XPath/CSS query and returns the content
|
56
64
|
# @return [String] the content of the HTML node
|
57
|
-
def
|
65
|
+
def element_content(query)
|
58
66
|
first = @doc.search(query).first
|
59
67
|
if first.nil?
|
60
68
|
nil
|
@@ -91,7 +99,9 @@ module DocParser
|
|
91
99
|
end
|
92
100
|
end
|
93
101
|
|
94
|
-
|
95
|
-
|
102
|
+
alias css each_element
|
103
|
+
alias xpath each_element
|
104
|
+
alias css_content element_content
|
105
|
+
alias xpath_content element_content
|
96
106
|
end
|
97
107
|
end
|
data/lib/docparser/output.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module DocParser
|
2
4
|
# The Output base class.
|
3
5
|
# All Output classes inherit from this one.
|
@@ -24,10 +26,11 @@ module DocParser
|
|
24
26
|
@filename = filename
|
25
27
|
@uniq = uniq
|
26
28
|
@uniqarr = []
|
27
|
-
|
29
|
+
raise ArgumentError, 'Please specify a filename' if filename.empty?
|
30
|
+
|
28
31
|
@file = open filename, 'w'
|
29
|
-
|
30
|
-
@logger =
|
32
|
+
@logger = Logger.new(STDERR)
|
33
|
+
@logger.level = Logger::INFO
|
31
34
|
open_file
|
32
35
|
end
|
33
36
|
|
@@ -40,6 +43,7 @@ module DocParser
|
|
40
43
|
# Adds a row
|
41
44
|
def add_row(row)
|
42
45
|
return if @uniq && @uniqarr.include?(row.hash)
|
46
|
+
|
43
47
|
@rowcount += 1
|
44
48
|
write_row row
|
45
49
|
@uniqarr << row.hash
|
@@ -66,12 +70,11 @@ module DocParser
|
|
66
70
|
|
67
71
|
# Called when a row is added
|
68
72
|
def write_row(_row)
|
69
|
-
|
73
|
+
raise NotImplementedError, 'No row writer defined'
|
70
74
|
end
|
71
75
|
|
72
76
|
# Called before closing the file
|
73
|
-
def footer
|
74
|
-
end
|
77
|
+
def footer; end
|
75
78
|
end
|
76
79
|
|
77
80
|
# MissingHeaderException gets thrown if a required header is missing.
|
@@ -1,65 +1,68 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'cgi'
|
2
4
|
module DocParser
|
3
5
|
# The HTMLOutput class generates an HTML file containing a table
|
4
6
|
# @see Output
|
5
7
|
class HTMLOutput < Output
|
6
8
|
# @!visibility private
|
7
|
-
HTMLHEADER =
|
8
|
-
<!DOCTYPE html>
|
9
|
-
<html>
|
10
|
-
<head>
|
11
|
-
<title>HTML output "#FILENAME#"</title>
|
12
|
-
<meta charset="utf-8">
|
13
|
-
<style type="text/css">
|
14
|
-
body {
|
15
|
-
|
16
|
-
|
17
|
-
}
|
18
|
-
table {
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
}
|
25
|
-
th {
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
}
|
32
|
-
td {
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
}
|
37
|
-
tbody tr:hover td {
|
38
|
-
|
39
|
-
|
40
|
-
}
|
41
|
-
tbody tr:nth-child(even) {
|
42
|
-
|
43
|
-
}
|
44
|
-
</style>
|
45
|
-
</head>
|
46
|
-
<body>
|
47
|
-
<table>
|
48
|
-
EOS
|
9
|
+
HTMLHEADER = <<~EOS
|
10
|
+
<!DOCTYPE html>
|
11
|
+
<html>
|
12
|
+
<head>
|
13
|
+
<title>HTML output "#FILENAME#"</title>
|
14
|
+
<meta charset="utf-8">
|
15
|
+
<style type="text/css">
|
16
|
+
body {
|
17
|
+
font-family:"Helvetica Neue", Helvetica, Arial, sans-serif;
|
18
|
+
font-size:12px;
|
19
|
+
}
|
20
|
+
table {
|
21
|
+
border:1px solid #69c;
|
22
|
+
border-collapse:collapse;
|
23
|
+
font-size:12px;
|
24
|
+
text-align:left;
|
25
|
+
width:480px;
|
26
|
+
}
|
27
|
+
th {
|
28
|
+
border-bottom:1px dashed #69c;
|
29
|
+
color:#039;
|
30
|
+
font-size:14px;
|
31
|
+
font-weight:normal;
|
32
|
+
padding:12px 17px;
|
33
|
+
}
|
34
|
+
td {
|
35
|
+
color:#669;
|
36
|
+
padding:7px 17px;
|
37
|
+
white-space: pre;
|
38
|
+
}
|
39
|
+
tbody tr:hover td {
|
40
|
+
background:#d0dafd;
|
41
|
+
color:#339;
|
42
|
+
}
|
43
|
+
tbody tr:nth-child(even) {
|
44
|
+
background:#e0eaff;
|
45
|
+
}
|
46
|
+
</style>
|
47
|
+
</head>
|
48
|
+
<body>
|
49
|
+
<table>
|
50
|
+
EOS
|
49
51
|
# @!visibility private
|
50
|
-
HTMLFOOTER =
|
51
|
-
</tbody>
|
52
|
-
</table>
|
53
|
-
<p>#COUNT# rows</p>
|
54
|
-
</body>
|
55
|
-
</html>
|
56
|
-
EOS
|
52
|
+
HTMLFOOTER = <<~EOS
|
53
|
+
</tbody>
|
54
|
+
</table>
|
55
|
+
<p>#COUNT# rows</p>
|
56
|
+
</body>
|
57
|
+
</html>
|
58
|
+
EOS
|
57
59
|
def open_file
|
58
60
|
@file << HTMLHEADER.gsub('#FILENAME#', @filename)
|
59
61
|
end
|
60
62
|
|
61
63
|
def header
|
62
64
|
return if @header.nil? || @header.empty?
|
65
|
+
|
63
66
|
@file << '<thead><tr>'
|
64
67
|
@file << @header.map { |f| '<th>' + f + '</th>' }.join
|
65
68
|
@file << "</tr></thead>\n<tbody>\n"
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'json'
|
2
4
|
module DocParser
|
3
5
|
# The JSONOutput class generates a JSON file containing all rows as separate
|
@@ -11,7 +13,7 @@ module DocParser
|
|
11
13
|
end
|
12
14
|
|
13
15
|
def write_row(row)
|
14
|
-
|
16
|
+
raise MissingHeaderException if @header.nil? || @header.empty?
|
15
17
|
|
16
18
|
@file << ',' unless @file.pos <= 1
|
17
19
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module DocParser
|
2
4
|
# The MultiOutput output combines multiple outputs.
|
3
5
|
# It creates a CSV, HTML, YAML and XLSX Output file
|
@@ -9,7 +11,7 @@ module DocParser
|
|
9
11
|
class MultiOutput < Output
|
10
12
|
# All the possible outputs
|
11
13
|
OUTPUT_TYPES = { csv: CSVOutput, html: HTMLOutput, yml: YAMLOutput,
|
12
|
-
xlsx: XLSXOutput, json: JSONOutput }
|
14
|
+
xlsx: XLSXOutput, json: JSONOutput }.freeze
|
13
15
|
|
14
16
|
# @!visibility private
|
15
17
|
def initialize(**options)
|