docparser 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.rubocop.yml +15 -3
- data/.rubocop_todo.yml +45 -0
- data/.travis.yml +1 -1
- data/Gemfile +5 -4
- data/README.md +2 -2
- data/Rakefile +3 -1
- data/docparser.gemspec +9 -9
- data/example.rb +2 -0
- data/lib/docparser.rb +2 -0
- data/lib/docparser/document.rb +20 -10
- data/lib/docparser/output.rb +9 -6
- data/lib/docparser/output/csv_output.rb +2 -0
- data/lib/docparser/output/html_output.rb +52 -49
- data/lib/docparser/output/json_output.rb +3 -1
- data/lib/docparser/output/multi_output.rb +3 -1
- data/lib/docparser/output/nil_output.rb +5 -6
- data/lib/docparser/output/xlsx_output.rb +2 -0
- data/lib/docparser/output/yaml_output.rb +4 -1
- data/lib/docparser/parser.rb +9 -13
- data/lib/docparser/version.rb +3 -1
- data/test/.rubocop.yml +6 -2
- data/test/.rubocop_todo.yml +23 -0
- data/test/lib/docparser/blackbox_test.rb +5 -4
- data/test/lib/docparser/document_test.rb +19 -14
- data/test/lib/docparser/output/csv_output_test.rb +5 -10
- data/test/lib/docparser/output/html_output_test.rb +5 -10
- data/test/lib/docparser/output/json_output_test.rb +8 -13
- data/test/lib/docparser/output/multi_output_test.rb +6 -12
- data/test/lib/docparser/output/nil_output_test.rb +4 -9
- data/test/lib/docparser/output/xlsx_output_test.rb +5 -10
- data/test/lib/docparser/output/yaml_output_test.rb +22 -27
- data/test/lib/docparser/output_test.rb +3 -8
- data/test/lib/docparser/parser_test.rb +2 -22
- data/test/lib/docparser/version_test.rb +2 -0
- data/test/support/hackaday/dl.rb +2 -0
- data/test/test_helper.rb +2 -3
- metadata +20 -35
- data/test/lib/docparser/logging_test.rb +0 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 3a857af8eb8403f2f26f867f02e13319fcb173c6f177947986d5fe81e7248a74
|
4
|
+
data.tar.gz: 53e66974c57f2662606f1125a0466ca0ce7db476cae0516da71837b3703263e4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 70c5721acae9866e862a6747a6749c894bb38dd20792545ad372b8f5265f9a4ca2f5a895e30fe36fc17eb2e511ffe86ec837974d2430ee1864b2b6773ce08b6c
|
7
|
+
data.tar.gz: 0b7cd82a97ad79b78bebab10319403cb6cd9cbee27137912315ede6ccbaa432f87d9a9267123724021f119c9f3d67de73dd57350455d3c00bd63132047b1f57c
|
data/.rubocop.yml
CHANGED
@@ -1,6 +1,18 @@
|
|
1
|
+
inherit_from: .rubocop_todo.yml
|
2
|
+
|
1
3
|
# Temporarily turn this off
|
2
4
|
# Avoid parameter lists longer than three or four parameters.
|
3
|
-
ParameterLists:
|
5
|
+
Metrics/ParameterLists:
|
4
6
|
Enabled: false
|
5
|
-
MultilineBlockChain:
|
6
|
-
Enabled: false
|
7
|
+
Style/MultilineBlockChain:
|
8
|
+
Enabled: false
|
9
|
+
Style/HashTransformKeys:
|
10
|
+
Enabled: true
|
11
|
+
Style/HashTransformValues:
|
12
|
+
Enabled: true
|
13
|
+
Style/HashEachMethods:
|
14
|
+
Enabled: true
|
15
|
+
Lint/RaiseException:
|
16
|
+
Enabled: true
|
17
|
+
Lint/StructNewOverride:
|
18
|
+
Enabled: true
|
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# This configuration was generated by
|
2
|
+
# `rubocop --auto-gen-config`
|
3
|
+
# on 2020-04-13 17:55:59 +0200 using RuboCop version 0.81.0.
|
4
|
+
# The point is for the user to remove these configuration records
|
5
|
+
# one by one as the offenses are removed from the code base.
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
8
|
+
|
9
|
+
# Offense count: 9
|
10
|
+
# Configuration parameters: CountComments, ExcludedMethods.
|
11
|
+
# ExcludedMethods: refine
|
12
|
+
Metrics/BlockLength:
|
13
|
+
Max: 173
|
14
|
+
|
15
|
+
# Offense count: 2
|
16
|
+
# Configuration parameters: ForbiddenDelimiters.
|
17
|
+
# ForbiddenDelimiters: (?-mix:(^|\s)(EO[A-Z]{1}|END)(\s|$))
|
18
|
+
Naming/HeredocDelimiterNaming:
|
19
|
+
Exclude:
|
20
|
+
- 'lib/docparser/output/html_output.rb'
|
21
|
+
|
22
|
+
# Offense count: 16
|
23
|
+
Security/Open:
|
24
|
+
Exclude:
|
25
|
+
- 'lib/docparser/document.rb'
|
26
|
+
- 'lib/docparser/output.rb'
|
27
|
+
- 'test/lib/docparser/document_test.rb'
|
28
|
+
- 'test/lib/docparser/output/csv_output_test.rb'
|
29
|
+
- 'test/lib/docparser/output/html_output_test.rb'
|
30
|
+
- 'test/lib/docparser/output/json_output_test.rb'
|
31
|
+
- 'test/lib/docparser/output/multi_output_test.rb'
|
32
|
+
- 'test/lib/docparser/output/yaml_output_test.rb'
|
33
|
+
|
34
|
+
# Offense count: 4
|
35
|
+
# Configuration parameters: EnforcedStyle.
|
36
|
+
# SupportedStyles: annotated, template, unannotated
|
37
|
+
Style/FormatStringToken:
|
38
|
+
Exclude:
|
39
|
+
- 'lib/docparser/output.rb'
|
40
|
+
- 'lib/docparser/parser.rb'
|
41
|
+
|
42
|
+
# Offense count: 1
|
43
|
+
Style/MixinUsage:
|
44
|
+
Exclude:
|
45
|
+
- 'example.rb'
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
@@ -1,12 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
gemspec
|
2
4
|
|
3
5
|
source 'https://rubygems.org'
|
4
6
|
|
5
7
|
group :test do
|
6
|
-
gem 'minitest', '~> 5.
|
7
|
-
gem 'coveralls', require: false
|
8
|
+
gem 'minitest', '~> 5.14.0'
|
8
9
|
gem 'rake'
|
9
|
-
gem 'rubocop', '~> 0.
|
10
|
-
gem 'simplecov', require: false
|
10
|
+
gem 'rubocop', '~> 0.81.0'
|
11
11
|
gem 'simple_mock'
|
12
|
+
gem 'simplecov', require: false
|
12
13
|
end
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# DocParser
|
2
2
|
|
3
|
-
[](http://badge.fury.io/rb/docparser) [](https://travis-ci.org/jurriaan/docparser) [](https://gemnasium.com/jurriaan/docparser)
|
4
4
|
|
5
5
|
|
6
6
|
DocParser is a web scraping/screen scraping tool.
|
@@ -59,4 +59,4 @@ See [example.rb](https://github.com/jurriaan/docparser/blob/master/example.rb)
|
|
59
59
|
|
60
60
|
## Thanks
|
61
61
|
|
62
|
-
- [randym](https://github.com/randym) - for providing the [axlsx](https://github.com/randym/axlsx) gem
|
62
|
+
- [randym](https://github.com/randym) - for providing the [axlsx](https://github.com/randym/axlsx) gem
|
data/Rakefile
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'bundler/gem_tasks'
|
2
4
|
require 'rake/testtask'
|
3
5
|
require 'rubocop'
|
@@ -16,7 +18,7 @@ task :rubocop do
|
|
16
18
|
puts "Running Rubocop #{RuboCop::Version::STRING}"
|
17
19
|
args = FileList['**/*.rb', 'Rakefile', 'docparser.gemspec', 'Gemfile']
|
18
20
|
cli = RuboCop::CLI.new
|
19
|
-
|
21
|
+
raise unless cli.run(args).zero?
|
20
22
|
end
|
21
23
|
|
22
24
|
task default: :test
|
data/docparser.gemspec
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
2
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
5
|
require 'docparser/version'
|
4
6
|
|
@@ -14,18 +16,16 @@ Gem::Specification.new do |spec|
|
|
14
16
|
spec.platform = Gem::Platform::RUBY
|
15
17
|
|
16
18
|
spec.files = `git ls-files`.split($RS)
|
17
|
-
spec.executables = spec.files.grep(
|
18
|
-
spec.test_files = spec.files.grep(
|
19
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
20
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
21
|
spec.require_paths = ['lib']
|
20
22
|
spec.extra_rdoc_files = ['README.md', 'LICENSE']
|
21
23
|
|
22
|
-
spec.add_runtime_dependency 'nokogiri', '~> 1.6.1'
|
23
|
-
spec.add_runtime_dependency 'parallel', '~> 1.3.2'
|
24
24
|
spec.add_runtime_dependency 'axlsx', '~> 2.0.1'
|
25
|
-
spec.add_runtime_dependency '
|
25
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.10.0'
|
26
|
+
spec.add_runtime_dependency 'parallel', '~> 1.10'
|
26
27
|
|
27
|
-
spec.add_development_dependency 'yard'
|
28
|
-
spec.add_development_dependency 'kramdown', '~> 1.4.1'
|
29
28
|
spec.add_development_dependency 'github-markup'
|
30
|
-
spec.
|
29
|
+
spec.add_development_dependency 'kramdown', '~> 2.1.0'
|
30
|
+
spec.add_development_dependency 'yard'
|
31
31
|
end
|
data/example.rb
CHANGED
data/lib/docparser.rb
CHANGED
data/lib/docparser/document.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
module DocParser
|
3
5
|
# The Document class loads and parses the files.
|
@@ -19,9 +21,10 @@ module DocParser
|
|
19
21
|
# @return [String] the source of the document
|
20
22
|
attr_reader :html
|
21
23
|
|
22
|
-
def initialize(filename: nil, encoding: 'utf-8', parser: nil)
|
23
|
-
@logger =
|
24
|
-
@logger.
|
24
|
+
def initialize(filename: nil, encoding: 'utf-8', parser: nil, logger: nil)
|
25
|
+
@logger = logger || Logger.new(STDERR)
|
26
|
+
@logger.level = Logger::INFO
|
27
|
+
@logger.debug("Parsing #{filename}")
|
25
28
|
@encoding = encoding
|
26
29
|
@parser = parser
|
27
30
|
@filename = filename
|
@@ -32,7 +35,7 @@ module DocParser
|
|
32
35
|
# Adds a row to an output
|
33
36
|
def add_row(*row, output: 0)
|
34
37
|
output = @parser.outputs.index(output) if output.is_a? Output
|
35
|
-
@logger.debug
|
38
|
+
@logger.debug("#{filename}: Adding row #{row.flatten}")
|
36
39
|
results[output] << row.flatten
|
37
40
|
end
|
38
41
|
|
@@ -42,9 +45,14 @@ module DocParser
|
|
42
45
|
@title ||= xpath_content('//head/title')
|
43
46
|
end
|
44
47
|
|
45
|
-
# Executes a xpath query
|
46
|
-
def
|
47
|
-
|
48
|
+
# Executes an xpath/css query
|
49
|
+
def elements(query)
|
50
|
+
@doc.search(query)
|
51
|
+
end
|
52
|
+
|
53
|
+
def each_element(query)
|
54
|
+
res = elements(query)
|
55
|
+
|
48
56
|
if block_given?
|
49
57
|
res.each { |el| yield el }
|
50
58
|
else
|
@@ -54,7 +62,7 @@ module DocParser
|
|
54
62
|
|
55
63
|
# Executes an xpath/css query and returns the content of the first match
|
56
64
|
# @return [String] the content of the HTML node
|
57
|
-
def
|
65
|
+
def element_content(query)
|
58
66
|
first = @doc.search(query).first
|
59
67
|
if first.nil?
|
60
68
|
nil
|
@@ -91,7 +99,9 @@ module DocParser
|
|
91
99
|
end
|
92
100
|
end
|
93
101
|
|
94
|
-
|
95
|
-
|
102
|
+
alias css each_element
|
103
|
+
alias xpath each_element
|
104
|
+
alias css_content element_content
|
105
|
+
alias xpath_content element_content
|
96
106
|
end
|
97
107
|
end
|
data/lib/docparser/output.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module DocParser
|
2
4
|
# The Output base class.
|
3
5
|
# All Output classes inherit from this one.
|
@@ -24,10 +26,11 @@ module DocParser
|
|
24
26
|
@filename = filename
|
25
27
|
@uniq = uniq
|
26
28
|
@uniqarr = []
|
27
|
-
|
29
|
+
raise ArgumentError, 'Please specify a filename' if filename.empty?
|
30
|
+
|
28
31
|
@file = open filename, 'w'
|
29
|
-
|
30
|
-
@logger =
|
32
|
+
@logger = Logger.new(STDERR)
|
33
|
+
@logger.level = Logger::INFO
|
31
34
|
open_file
|
32
35
|
end
|
33
36
|
|
@@ -40,6 +43,7 @@ module DocParser
|
|
40
43
|
# Adds a row
|
41
44
|
def add_row(row)
|
42
45
|
return if @uniq && @uniqarr.include?(row.hash)
|
46
|
+
|
43
47
|
@rowcount += 1
|
44
48
|
write_row row
|
45
49
|
@uniqarr << row.hash
|
@@ -66,12 +70,11 @@ module DocParser
|
|
66
70
|
|
67
71
|
# Called when a row is added
|
68
72
|
def write_row(_row)
|
69
|
-
|
73
|
+
raise NotImplementedError, 'No row writer defined'
|
70
74
|
end
|
71
75
|
|
72
76
|
# Called before closing the file
|
73
|
-
def footer
|
74
|
-
end
|
77
|
+
def footer; end
|
75
78
|
end
|
76
79
|
|
77
80
|
# MissingHeaderException gets thrown if a required header is missing.
|
@@ -1,65 +1,68 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'cgi'
|
2
4
|
module DocParser
|
3
5
|
# The HTMLOutput class generates an HTML file containing a table
|
4
6
|
# @see Output
|
5
7
|
class HTMLOutput < Output
|
6
8
|
# @!visibility private
|
7
|
-
HTMLHEADER =
|
8
|
-
<!DOCTYPE html>
|
9
|
-
<html>
|
10
|
-
<head>
|
11
|
-
<title>HTML output "#FILENAME#"</title>
|
12
|
-
<meta charset="utf-8">
|
13
|
-
<style type="text/css">
|
14
|
-
body {
|
15
|
-
|
16
|
-
|
17
|
-
}
|
18
|
-
table {
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
}
|
25
|
-
th {
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
}
|
32
|
-
td {
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
}
|
37
|
-
tbody tr:hover td {
|
38
|
-
|
39
|
-
|
40
|
-
}
|
41
|
-
tbody tr:nth-child(even) {
|
42
|
-
|
43
|
-
}
|
44
|
-
</style>
|
45
|
-
</head>
|
46
|
-
<body>
|
47
|
-
<table>
|
48
|
-
EOS
|
9
|
+
HTMLHEADER = <<~EOS
|
10
|
+
<!DOCTYPE html>
|
11
|
+
<html>
|
12
|
+
<head>
|
13
|
+
<title>HTML output "#FILENAME#"</title>
|
14
|
+
<meta charset="utf-8">
|
15
|
+
<style type="text/css">
|
16
|
+
body {
|
17
|
+
font-family:"Helvetica Neue", Helvetica, Arial, sans-serif;
|
18
|
+
font-size:12px;
|
19
|
+
}
|
20
|
+
table {
|
21
|
+
border:1px solid #69c;
|
22
|
+
border-collapse:collapse;
|
23
|
+
font-size:12px;
|
24
|
+
text-align:left;
|
25
|
+
width:480px;
|
26
|
+
}
|
27
|
+
th {
|
28
|
+
border-bottom:1px dashed #69c;
|
29
|
+
color:#039;
|
30
|
+
font-size:14px;
|
31
|
+
font-weight:normal;
|
32
|
+
padding:12px 17px;
|
33
|
+
}
|
34
|
+
td {
|
35
|
+
color:#669;
|
36
|
+
padding:7px 17px;
|
37
|
+
white-space: pre;
|
38
|
+
}
|
39
|
+
tbody tr:hover td {
|
40
|
+
background:#d0dafd;
|
41
|
+
color:#339;
|
42
|
+
}
|
43
|
+
tbody tr:nth-child(even) {
|
44
|
+
background:#e0eaff;
|
45
|
+
}
|
46
|
+
</style>
|
47
|
+
</head>
|
48
|
+
<body>
|
49
|
+
<table>
|
50
|
+
EOS
|
49
51
|
# @!visibility private
|
50
|
-
HTMLFOOTER =
|
51
|
-
</tbody>
|
52
|
-
</table>
|
53
|
-
<p>#COUNT# rows</p>
|
54
|
-
</body>
|
55
|
-
</html>
|
56
|
-
EOS
|
52
|
+
HTMLFOOTER = <<~EOS
|
53
|
+
</tbody>
|
54
|
+
</table>
|
55
|
+
<p>#COUNT# rows</p>
|
56
|
+
</body>
|
57
|
+
</html>
|
58
|
+
EOS
|
57
59
|
def open_file
|
58
60
|
@file << HTMLHEADER.gsub('#FILENAME#', @filename)
|
59
61
|
end
|
60
62
|
|
61
63
|
def header
|
62
64
|
return if @header.nil? || @header.empty?
|
65
|
+
|
63
66
|
@file << '<thead><tr>'
|
64
67
|
@file << @header.map { |f| '<th>' + f + '</th>' }.join
|
65
68
|
@file << "</tr></thead>\n<tbody>\n"
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'json'
|
2
4
|
module DocParser
|
3
5
|
# The JSONOutput class generates a JSON file containing all rows as separate
|
@@ -11,7 +13,7 @@ module DocParser
|
|
11
13
|
end
|
12
14
|
|
13
15
|
def write_row(row)
|
14
|
-
|
16
|
+
raise MissingHeaderException if @header.nil? || @header.empty?
|
15
17
|
|
16
18
|
@file << ',' unless @file.pos <= 1
|
17
19
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module DocParser
|
2
4
|
# The MultiOutput output combines multiple outputs.
|
3
5
|
# It creates a CSV, HTML, YAML, XLSX and JSON Output file
|
@@ -9,7 +11,7 @@ module DocParser
|
|
9
11
|
class MultiOutput < Output
|
10
12
|
# All the possible outputs
|
11
13
|
OUTPUT_TYPES = { csv: CSVOutput, html: HTMLOutput, yml: YAMLOutput,
|
12
|
-
xlsx: XLSXOutput, json: JSONOutput }
|
14
|
+
xlsx: XLSXOutput, json: JSONOutput }.freeze
|
13
15
|
|
14
16
|
# @!visibility private
|
15
17
|
def initialize(**options)
|