docparser 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/Gemfile +2 -3
- data/README.md +1 -2
- data/docparser.gemspec +2 -4
- data/lib/docparser/document.rb +11 -1
- data/lib/docparser/output.rb +2 -2
- data/lib/docparser/output/csv_output.rb +0 -1
- data/lib/docparser/output/json_output.rb +1 -1
- data/lib/docparser/output/yaml_output.rb +1 -1
- data/lib/docparser/parser.rb +2 -5
- data/lib/docparser/version.rb +1 -1
- data/test/lib/docparser/document_test.rb +0 -6
- data/test/lib/docparser/parser_test.rb +1 -1
- data/test/test_helper.rb +1 -0
- metadata +7 -38
- data/lib/docparser/output/screen_output.rb +0 -37
- data/test/lib/docparser/output/screen_output_test.rb +0 -54
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 94ae9577e872555f07210deffe7c4ebc192ac617
|
4
|
+
data.tar.gz: cec0672efe1be1d5bab9d3bff313b4b5c8bc9513
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f1105cb8834ea0c03dd1e4672ae75792ef3a242b44a3a11d718ebcf444723ea7d0e42a64f23972c2c6faf1f7ddb5c7e10cb418094fad37d255c58ad3c1ebf0a2
|
7
|
+
data.tar.gz: 5afaaee3978d2ad2f50d980622540714645a02673f2ed9d3a645b415230141839dd32f31295c37a798427745a87bedb742a02c8eba11a6f379f0e6655514e58f
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
@@ -1,13 +1,12 @@
|
|
1
|
-
ruby '2.0.0'
|
2
1
|
gemspec
|
3
2
|
|
4
3
|
source 'https://rubygems.org'
|
5
4
|
|
6
5
|
group :test do
|
7
|
-
gem 'minitest', '~> 5.0
|
6
|
+
gem 'minitest', '~> 5.3.0'
|
8
7
|
gem 'coveralls', require: false
|
9
8
|
gem 'rake'
|
10
|
-
gem 'rubocop', '~> 0.
|
9
|
+
gem 'rubocop', '~> 0.18.1'
|
11
10
|
gem 'simplecov', require: false
|
12
11
|
gem 'simple_mock'
|
13
12
|
end
|
data/README.md
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
DocParser is a web scraping/screen scraping tool.
|
7
7
|
|
8
|
-
You can use it to easily scrape
|
8
|
+
You can use it to easily scrape information out of HTML documents.
|
9
9
|
|
10
10
|
The gem is called [docparser](http://rubygems.org/gems/docparser).
|
11
11
|
You can find the documentation [here](http://rubydoc.info/github/jurriaan/docparser/).
|
@@ -13,7 +13,6 @@ You can find the documentation [here](http://rubydoc.info/github/jurriaan/docpar
|
|
13
13
|
## Features
|
14
14
|
|
15
15
|
- XPath and CSS support through Nokogiri
|
16
|
-
- Support for loading of URLs throug open-uri
|
17
16
|
- Support for parallel processing of the documents
|
18
17
|
- 6 Output formats:
|
19
18
|
* CSV
|
data/docparser.gemspec
CHANGED
@@ -19,11 +19,9 @@ Gem::Specification.new do |spec|
|
|
19
19
|
spec.require_paths = ['lib']
|
20
20
|
spec.extra_rdoc_files = ['README.md', 'LICENSE']
|
21
21
|
|
22
|
-
spec.add_runtime_dependency 'nokogiri', '~> 1.6.
|
23
|
-
spec.add_runtime_dependency 'parallel', '~> 0.
|
22
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.6.1'
|
23
|
+
spec.add_runtime_dependency 'parallel', '~> 0.9.1'
|
24
24
|
spec.add_runtime_dependency 'axlsx', '~> 2.0.1'
|
25
|
-
spec.add_runtime_dependency 'terminal-table', '~> 1.4.5'
|
26
|
-
spec.add_runtime_dependency 'pageme', '~> 0.0.3'
|
27
25
|
spec.add_runtime_dependency 'log4r', '~> 1.1.10'
|
28
26
|
|
29
27
|
spec.add_development_dependency 'yard'
|
data/lib/docparser/document.rb
CHANGED
@@ -4,7 +4,17 @@ module DocParser
|
|
4
4
|
# @see Parser
|
5
5
|
# @see Output
|
6
6
|
class Document
|
7
|
-
|
7
|
+
# @return [String] the filename of the current document
|
8
|
+
attr_reader :filename
|
9
|
+
|
10
|
+
# @return [Nokogiri::HTML::Document] a reference to the Nokogiri document
|
11
|
+
attr_reader :doc
|
12
|
+
|
13
|
+
# @return [String] the encoding of the document
|
14
|
+
attr_reader :encoding
|
15
|
+
|
16
|
+
# @return [Array] the results from this document
|
17
|
+
attr_reader :results
|
8
18
|
|
9
19
|
# @return [String] the source of the document
|
10
20
|
attr_reader :html
|
data/lib/docparser/output.rb
CHANGED
@@ -9,7 +9,7 @@ module DocParser
|
|
9
9
|
def initialize(filename: filename)
|
10
10
|
@rowcount = 0
|
11
11
|
@filename = filename
|
12
|
-
|
12
|
+
fail ArgumentError, 'Please specify a filename' if filename.empty?
|
13
13
|
@file = open filename, 'w'
|
14
14
|
classname = self.class.name.split('::').last
|
15
15
|
@logger = Log4r::Logger.new("docparser::output::#{classname}")
|
@@ -49,7 +49,7 @@ module DocParser
|
|
49
49
|
|
50
50
|
# Called when a row is added
|
51
51
|
def write_row(row)
|
52
|
-
|
52
|
+
fail NotImplementedError, 'No row writer defined'
|
53
53
|
end
|
54
54
|
|
55
55
|
# Called before closing the file
|
@@ -6,7 +6,7 @@ module DocParser
|
|
6
6
|
class YAMLOutput < Output
|
7
7
|
# @!visibility private
|
8
8
|
def write_row(row)
|
9
|
-
|
9
|
+
fail MissingHeaderException if @header.nil? || @header.length == 0
|
10
10
|
@doc ||= {}
|
11
11
|
|
12
12
|
0.upto(@header.length - 1) do |counter|
|
data/lib/docparser/parser.rb
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'bundler/setup'
|
3
|
-
require 'open-uri'
|
4
3
|
require 'parallel'
|
5
4
|
require 'set'
|
6
5
|
require 'log4r'
|
@@ -8,7 +7,6 @@ require 'log4r/formatter/patternformatter'
|
|
8
7
|
require 'docparser/version'
|
9
8
|
require 'docparser/output'
|
10
9
|
require 'docparser/document'
|
11
|
-
require 'docparser/output/screen_output.rb'
|
12
10
|
require 'docparser/output/csv_output.rb'
|
13
11
|
require 'docparser/output/html_output.rb'
|
14
12
|
require 'docparser/output/xlsx_output.rb'
|
@@ -86,7 +84,7 @@ module DocParser
|
|
86
84
|
elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
|
87
85
|
@outputs = output
|
88
86
|
elsif !output.nil?
|
89
|
-
|
87
|
+
fail ArgumentError, 'Invalid outputs specified'
|
90
88
|
end
|
91
89
|
|
92
90
|
@resultsets = Array.new(@outputs.length) { Set.new }
|
@@ -95,7 +93,7 @@ module DocParser
|
|
95
93
|
def parallel_process(&block)
|
96
94
|
@logger.info "Starting #{@num_processes} processes"
|
97
95
|
option = RUBY_ENGINE == 'ruby' ? :in_processes : :in_threads
|
98
|
-
Parallel.map(@files,
|
96
|
+
Parallel.map(@files, option => @num_processes) do |file|
|
99
97
|
# :nocov: #
|
100
98
|
parse_doc(file, &block)
|
101
99
|
# :nocov: #
|
@@ -129,6 +127,5 @@ module DocParser
|
|
129
127
|
output.close
|
130
128
|
end
|
131
129
|
end
|
132
|
-
|
133
130
|
end
|
134
131
|
end
|
data/lib/docparser/version.rb
CHANGED
@@ -27,12 +27,6 @@ describe DocParser::Document do
|
|
27
27
|
doc.xpath_content('xmltest > test').must_equal('Character Data')
|
28
28
|
end
|
29
29
|
|
30
|
-
it 'should read remote contents' do
|
31
|
-
url = 'https://gist.github.com/jurriaan/3f2750aa546e3e6719cf/raw'
|
32
|
-
doc = DocParser::Document.new(filename: url, parser: @parser)
|
33
|
-
doc.html.must_equal(open(url).read)
|
34
|
-
end
|
35
|
-
|
36
30
|
it 'should use the correct encoding' do
|
37
31
|
file = File.join($SUPPORT_DIR, 'test_encoding.html')
|
38
32
|
file2 = File.join($SUPPORT_DIR, 'test_encoding2.html')
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jurriaan Pruis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-03-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -16,28 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 1.6.
|
19
|
+
version: 1.6.1
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 1.6.
|
26
|
+
version: 1.6.1
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: parallel
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 0.
|
33
|
+
version: 0.9.1
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 0.
|
40
|
+
version: 0.9.1
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: axlsx
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,34 +52,6 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: 2.0.1
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: terminal-table
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - "~>"
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: 1.4.5
|
62
|
-
type: :runtime
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - "~>"
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: 1.4.5
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: pageme
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - "~>"
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: 0.0.3
|
76
|
-
type: :runtime
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - "~>"
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: 0.0.3
|
83
55
|
- !ruby/object:Gem::Dependency
|
84
56
|
name: log4r
|
85
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -164,7 +136,6 @@ files:
|
|
164
136
|
- lib/docparser/output/json_output.rb
|
165
137
|
- lib/docparser/output/multi_output.rb
|
166
138
|
- lib/docparser/output/nil_output.rb
|
167
|
-
- lib/docparser/output/screen_output.rb
|
168
139
|
- lib/docparser/output/xlsx_output.rb
|
169
140
|
- lib/docparser/output/yaml_output.rb
|
170
141
|
- lib/docparser/parser.rb
|
@@ -178,7 +149,6 @@ files:
|
|
178
149
|
- test/lib/docparser/output/json_output_test.rb
|
179
150
|
- test/lib/docparser/output/multi_output_test.rb
|
180
151
|
- test/lib/docparser/output/nil_output_test.rb
|
181
|
-
- test/lib/docparser/output/screen_output_test.rb
|
182
152
|
- test/lib/docparser/output/xlsx_output_test.rb
|
183
153
|
- test/lib/docparser/output/yaml_output_test.rb
|
184
154
|
- test/lib/docparser/output_test.rb
|
@@ -250,7 +220,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
250
220
|
version: '0'
|
251
221
|
requirements: []
|
252
222
|
rubyforge_project:
|
253
|
-
rubygems_version: 2.
|
223
|
+
rubygems_version: 2.2.2
|
254
224
|
signing_key:
|
255
225
|
specification_version: 4
|
256
226
|
summary: DocParser is a Ruby Gem for webscraping
|
@@ -264,7 +234,6 @@ test_files:
|
|
264
234
|
- test/lib/docparser/output/json_output_test.rb
|
265
235
|
- test/lib/docparser/output/multi_output_test.rb
|
266
236
|
- test/lib/docparser/output/nil_output_test.rb
|
267
|
-
- test/lib/docparser/output/screen_output_test.rb
|
268
237
|
- test/lib/docparser/output/xlsx_output_test.rb
|
269
238
|
- test/lib/docparser/output/yaml_output_test.rb
|
270
239
|
- test/lib/docparser/output_test.rb
|
@@ -1,37 +0,0 @@
|
|
1
|
-
require 'terminal-table'
|
2
|
-
require 'pageme'
|
3
|
-
module DocParser
|
4
|
-
# This Output can be used for debugging purposes.
|
5
|
-
|
6
|
-
# This output sends the results directly to the terminal and pipes all rows
|
7
|
-
# through a pager
|
8
|
-
# @see Output
|
9
|
-
class ScreenOutput < Output
|
10
|
-
# @!visibility private
|
11
|
-
|
12
|
-
include PageMe
|
13
|
-
|
14
|
-
def initialize
|
15
|
-
@tables = []
|
16
|
-
@rowcount = 0
|
17
|
-
end
|
18
|
-
|
19
|
-
def close
|
20
|
-
page do |p|
|
21
|
-
p.puts "Showing all #{@tables.length} rows:\n\n"
|
22
|
-
@tables.each do |table|
|
23
|
-
p.puts table
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
def write_row(row)
|
29
|
-
raise MissingHeaderException if @header.nil? || @header.length == 0
|
30
|
-
out = []
|
31
|
-
0.upto(@header.length - 1) do |counter|
|
32
|
-
out << [@header[counter], row[counter]]
|
33
|
-
end
|
34
|
-
@tables << Terminal::Table.new(rows: out)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
@@ -1,54 +0,0 @@
|
|
1
|
-
require_relative '../../../test_helper'
|
2
|
-
require 'stringio'
|
3
|
-
describe DocParser::ScreenOutput do
|
4
|
-
before do
|
5
|
-
Log4r::Logger['docparser'].level = Log4r::ERROR
|
6
|
-
end
|
7
|
-
after do
|
8
|
-
Log4r::Logger['docparser'].level = Log4r::INFO
|
9
|
-
end
|
10
|
-
|
11
|
-
it 'should not create a file' do
|
12
|
-
Dir.mktmpdir do |dir|
|
13
|
-
filename = File.join(dir, '*')
|
14
|
-
DocParser::ScreenOutput.new
|
15
|
-
Dir[filename].must_be_empty
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
it 'must give the correct rowcount' do
|
20
|
-
output = DocParser::ScreenOutput.new
|
21
|
-
output.header = 'test', 'the', 'header'
|
22
|
-
output.rowcount.must_equal 0
|
23
|
-
output.add_row %w(aap noot mies)
|
24
|
-
output.add_row %w(aap noot mies)
|
25
|
-
output.rowcount.must_equal 2
|
26
|
-
end
|
27
|
-
|
28
|
-
it 'must have a header' do
|
29
|
-
output = DocParser::ScreenOutput.new
|
30
|
-
lambda do
|
31
|
-
output.add_row %w(aap noot mies)
|
32
|
-
end.must_raise(DocParser::MissingHeaderException)
|
33
|
-
end
|
34
|
-
|
35
|
-
it 'must output the data after close' do
|
36
|
-
$out = StringIO.new
|
37
|
-
output = Class.new DocParser::ScreenOutput do
|
38
|
-
def page(*args, &p)
|
39
|
-
args << p
|
40
|
-
args.compact!
|
41
|
-
page_to $out, args
|
42
|
-
end
|
43
|
-
end.new
|
44
|
-
output.header = 'test', 'the', 'header'
|
45
|
-
output.add_row ['aap1' , '', 'mies']
|
46
|
-
output.add_row %w(aap2 mies1)
|
47
|
-
output.close
|
48
|
-
out = $out.string
|
49
|
-
out.must_include 'header'
|
50
|
-
out.must_include 'aap1'
|
51
|
-
out.must_include 'mies1'
|
52
|
-
out.must_include 'mies'
|
53
|
-
end
|
54
|
-
end
|