docparser 0.1.6 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/Gemfile +2 -3
- data/README.md +1 -2
- data/docparser.gemspec +2 -4
- data/lib/docparser/document.rb +11 -1
- data/lib/docparser/output.rb +2 -2
- data/lib/docparser/output/csv_output.rb +0 -1
- data/lib/docparser/output/json_output.rb +1 -1
- data/lib/docparser/output/yaml_output.rb +1 -1
- data/lib/docparser/parser.rb +2 -5
- data/lib/docparser/version.rb +1 -1
- data/test/lib/docparser/document_test.rb +0 -6
- data/test/lib/docparser/parser_test.rb +1 -1
- data/test/test_helper.rb +1 -0
- metadata +7 -38
- data/lib/docparser/output/screen_output.rb +0 -37
- data/test/lib/docparser/output/screen_output_test.rb +0 -54
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 94ae9577e872555f07210deffe7c4ebc192ac617
|
4
|
+
data.tar.gz: cec0672efe1be1d5bab9d3bff313b4b5c8bc9513
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f1105cb8834ea0c03dd1e4672ae75792ef3a242b44a3a11d718ebcf444723ea7d0e42a64f23972c2c6faf1f7ddb5c7e10cb418094fad37d255c58ad3c1ebf0a2
|
7
|
+
data.tar.gz: 5afaaee3978d2ad2f50d980622540714645a02673f2ed9d3a645b415230141839dd32f31295c37a798427745a87bedb742a02c8eba11a6f379f0e6655514e58f
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
@@ -1,13 +1,12 @@
|
|
1
|
-
ruby '2.0.0'
|
2
1
|
gemspec
|
3
2
|
|
4
3
|
source 'https://rubygems.org'
|
5
4
|
|
6
5
|
group :test do
|
7
|
-
gem 'minitest', '~> 5.0
|
6
|
+
gem 'minitest', '~> 5.3.0'
|
8
7
|
gem 'coveralls', require: false
|
9
8
|
gem 'rake'
|
10
|
-
gem 'rubocop', '~> 0.
|
9
|
+
gem 'rubocop', '~> 0.18.1'
|
11
10
|
gem 'simplecov', require: false
|
12
11
|
gem 'simple_mock'
|
13
12
|
end
|
data/README.md
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
DocParser is a web scraping/screen scraping tool.
|
7
7
|
|
8
|
-
You can use it to easily scrape
|
8
|
+
You can use it to easily scrape information out of HTML documents.
|
9
9
|
|
10
10
|
The gem is called [docparser](http://rubygems.org/gems/docparser).
|
11
11
|
You can find the documentation [here](http://rubydoc.info/github/jurriaan/docparser/).
|
@@ -13,7 +13,6 @@ You can find the documentation [here](http://rubydoc.info/github/jurriaan/docpar
|
|
13
13
|
## Features
|
14
14
|
|
15
15
|
- XPath and CSS support through Nokogiri
|
16
|
-
- Support for loading of URLs throug open-uri
|
17
16
|
- Support for parallel processing of the documents
|
18
17
|
- 6 Output formats:
|
19
18
|
* CSV
|
data/docparser.gemspec
CHANGED
@@ -19,11 +19,9 @@ Gem::Specification.new do |spec|
|
|
19
19
|
spec.require_paths = ['lib']
|
20
20
|
spec.extra_rdoc_files = ['README.md', 'LICENSE']
|
21
21
|
|
22
|
-
spec.add_runtime_dependency 'nokogiri', '~> 1.6.
|
23
|
-
spec.add_runtime_dependency 'parallel', '~> 0.
|
22
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.6.1'
|
23
|
+
spec.add_runtime_dependency 'parallel', '~> 0.9.1'
|
24
24
|
spec.add_runtime_dependency 'axlsx', '~> 2.0.1'
|
25
|
-
spec.add_runtime_dependency 'terminal-table', '~> 1.4.5'
|
26
|
-
spec.add_runtime_dependency 'pageme', '~> 0.0.3'
|
27
25
|
spec.add_runtime_dependency 'log4r', '~> 1.1.10'
|
28
26
|
|
29
27
|
spec.add_development_dependency 'yard'
|
data/lib/docparser/document.rb
CHANGED
@@ -4,7 +4,17 @@ module DocParser
|
|
4
4
|
# @see Parser
|
5
5
|
# @see Output
|
6
6
|
class Document
|
7
|
-
|
7
|
+
# @return [String] the filename of the current document
|
8
|
+
attr_reader :filename
|
9
|
+
|
10
|
+
# @return [Nokogiri::HTML::Document] a reference to the Nokogiri document
|
11
|
+
attr_reader :doc
|
12
|
+
|
13
|
+
# @return [String] the encoding of the document
|
14
|
+
attr_reader :encoding
|
15
|
+
|
16
|
+
# @return [Array] the results from this document
|
17
|
+
attr_reader :results
|
8
18
|
|
9
19
|
# @return [String] the source of the document
|
10
20
|
attr_reader :html
|
data/lib/docparser/output.rb
CHANGED
@@ -9,7 +9,7 @@ module DocParser
|
|
9
9
|
def initialize(filename: filename)
|
10
10
|
@rowcount = 0
|
11
11
|
@filename = filename
|
12
|
-
|
12
|
+
fail ArgumentError, 'Please specify a filename' if filename.empty?
|
13
13
|
@file = open filename, 'w'
|
14
14
|
classname = self.class.name.split('::').last
|
15
15
|
@logger = Log4r::Logger.new("docparser::output::#{classname}")
|
@@ -49,7 +49,7 @@ module DocParser
|
|
49
49
|
|
50
50
|
# Called when a row is added
|
51
51
|
def write_row(row)
|
52
|
-
|
52
|
+
fail NotImplementedError, 'No row writer defined'
|
53
53
|
end
|
54
54
|
|
55
55
|
# Called before closing the file
|
@@ -6,7 +6,7 @@ module DocParser
|
|
6
6
|
class YAMLOutput < Output
|
7
7
|
# @!visibility private
|
8
8
|
def write_row(row)
|
9
|
-
|
9
|
+
fail MissingHeaderException if @header.nil? || @header.length == 0
|
10
10
|
@doc ||= {}
|
11
11
|
|
12
12
|
0.upto(@header.length - 1) do |counter|
|
data/lib/docparser/parser.rb
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'bundler/setup'
|
3
|
-
require 'open-uri'
|
4
3
|
require 'parallel'
|
5
4
|
require 'set'
|
6
5
|
require 'log4r'
|
@@ -8,7 +7,6 @@ require 'log4r/formatter/patternformatter'
|
|
8
7
|
require 'docparser/version'
|
9
8
|
require 'docparser/output'
|
10
9
|
require 'docparser/document'
|
11
|
-
require 'docparser/output/screen_output.rb'
|
12
10
|
require 'docparser/output/csv_output.rb'
|
13
11
|
require 'docparser/output/html_output.rb'
|
14
12
|
require 'docparser/output/xlsx_output.rb'
|
@@ -86,7 +84,7 @@ module DocParser
|
|
86
84
|
elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
|
87
85
|
@outputs = output
|
88
86
|
elsif !output.nil?
|
89
|
-
|
87
|
+
fail ArgumentError, 'Invalid outputs specified'
|
90
88
|
end
|
91
89
|
|
92
90
|
@resultsets = Array.new(@outputs.length) { Set.new }
|
@@ -95,7 +93,7 @@ module DocParser
|
|
95
93
|
def parallel_process(&block)
|
96
94
|
@logger.info "Starting #{@num_processes} processes"
|
97
95
|
option = RUBY_ENGINE == 'ruby' ? :in_processes : :in_threads
|
98
|
-
Parallel.map(@files,
|
96
|
+
Parallel.map(@files, option => @num_processes) do |file|
|
99
97
|
# :nocov: #
|
100
98
|
parse_doc(file, &block)
|
101
99
|
# :nocov: #
|
@@ -129,6 +127,5 @@ module DocParser
|
|
129
127
|
output.close
|
130
128
|
end
|
131
129
|
end
|
132
|
-
|
133
130
|
end
|
134
131
|
end
|
data/lib/docparser/version.rb
CHANGED
@@ -27,12 +27,6 @@ describe DocParser::Document do
|
|
27
27
|
doc.xpath_content('xmltest > test').must_equal('Character Data')
|
28
28
|
end
|
29
29
|
|
30
|
-
it 'should read remote contents' do
|
31
|
-
url = 'https://gist.github.com/jurriaan/3f2750aa546e3e6719cf/raw'
|
32
|
-
doc = DocParser::Document.new(filename: url, parser: @parser)
|
33
|
-
doc.html.must_equal(open(url).read)
|
34
|
-
end
|
35
|
-
|
36
30
|
it 'should use the correct encoding' do
|
37
31
|
file = File.join($SUPPORT_DIR, 'test_encoding.html')
|
38
32
|
file2 = File.join($SUPPORT_DIR, 'test_encoding2.html')
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jurriaan Pruis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-03-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -16,28 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 1.6.
|
19
|
+
version: 1.6.1
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 1.6.
|
26
|
+
version: 1.6.1
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: parallel
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 0.
|
33
|
+
version: 0.9.1
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 0.
|
40
|
+
version: 0.9.1
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: axlsx
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,34 +52,6 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: 2.0.1
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: terminal-table
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - "~>"
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: 1.4.5
|
62
|
-
type: :runtime
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - "~>"
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: 1.4.5
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: pageme
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - "~>"
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: 0.0.3
|
76
|
-
type: :runtime
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - "~>"
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: 0.0.3
|
83
55
|
- !ruby/object:Gem::Dependency
|
84
56
|
name: log4r
|
85
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -164,7 +136,6 @@ files:
|
|
164
136
|
- lib/docparser/output/json_output.rb
|
165
137
|
- lib/docparser/output/multi_output.rb
|
166
138
|
- lib/docparser/output/nil_output.rb
|
167
|
-
- lib/docparser/output/screen_output.rb
|
168
139
|
- lib/docparser/output/xlsx_output.rb
|
169
140
|
- lib/docparser/output/yaml_output.rb
|
170
141
|
- lib/docparser/parser.rb
|
@@ -178,7 +149,6 @@ files:
|
|
178
149
|
- test/lib/docparser/output/json_output_test.rb
|
179
150
|
- test/lib/docparser/output/multi_output_test.rb
|
180
151
|
- test/lib/docparser/output/nil_output_test.rb
|
181
|
-
- test/lib/docparser/output/screen_output_test.rb
|
182
152
|
- test/lib/docparser/output/xlsx_output_test.rb
|
183
153
|
- test/lib/docparser/output/yaml_output_test.rb
|
184
154
|
- test/lib/docparser/output_test.rb
|
@@ -250,7 +220,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
250
220
|
version: '0'
|
251
221
|
requirements: []
|
252
222
|
rubyforge_project:
|
253
|
-
rubygems_version: 2.
|
223
|
+
rubygems_version: 2.2.2
|
254
224
|
signing_key:
|
255
225
|
specification_version: 4
|
256
226
|
summary: DocParser is a Ruby Gem for webscraping
|
@@ -264,7 +234,6 @@ test_files:
|
|
264
234
|
- test/lib/docparser/output/json_output_test.rb
|
265
235
|
- test/lib/docparser/output/multi_output_test.rb
|
266
236
|
- test/lib/docparser/output/nil_output_test.rb
|
267
|
-
- test/lib/docparser/output/screen_output_test.rb
|
268
237
|
- test/lib/docparser/output/xlsx_output_test.rb
|
269
238
|
- test/lib/docparser/output/yaml_output_test.rb
|
270
239
|
- test/lib/docparser/output_test.rb
|
@@ -1,37 +0,0 @@
|
|
1
|
-
require 'terminal-table'
|
2
|
-
require 'pageme'
|
3
|
-
module DocParser
|
4
|
-
# This Output can be used for debugging purposes.
|
5
|
-
|
6
|
-
# This output sends the results directly to the terminal and pipes all rows
|
7
|
-
# through a pager
|
8
|
-
# @see Output
|
9
|
-
class ScreenOutput < Output
|
10
|
-
# @!visibility private
|
11
|
-
|
12
|
-
include PageMe
|
13
|
-
|
14
|
-
def initialize
|
15
|
-
@tables = []
|
16
|
-
@rowcount = 0
|
17
|
-
end
|
18
|
-
|
19
|
-
def close
|
20
|
-
page do |p|
|
21
|
-
p.puts "Showing all #{@tables.length} rows:\n\n"
|
22
|
-
@tables.each do |table|
|
23
|
-
p.puts table
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
def write_row(row)
|
29
|
-
raise MissingHeaderException if @header.nil? || @header.length == 0
|
30
|
-
out = []
|
31
|
-
0.upto(@header.length - 1) do |counter|
|
32
|
-
out << [@header[counter], row[counter]]
|
33
|
-
end
|
34
|
-
@tables << Terminal::Table.new(rows: out)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
@@ -1,54 +0,0 @@
|
|
1
|
-
require_relative '../../../test_helper'
|
2
|
-
require 'stringio'
|
3
|
-
describe DocParser::ScreenOutput do
|
4
|
-
before do
|
5
|
-
Log4r::Logger['docparser'].level = Log4r::ERROR
|
6
|
-
end
|
7
|
-
after do
|
8
|
-
Log4r::Logger['docparser'].level = Log4r::INFO
|
9
|
-
end
|
10
|
-
|
11
|
-
it 'should not create a file' do
|
12
|
-
Dir.mktmpdir do |dir|
|
13
|
-
filename = File.join(dir, '*')
|
14
|
-
DocParser::ScreenOutput.new
|
15
|
-
Dir[filename].must_be_empty
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
it 'must give the correct rowcount' do
|
20
|
-
output = DocParser::ScreenOutput.new
|
21
|
-
output.header = 'test', 'the', 'header'
|
22
|
-
output.rowcount.must_equal 0
|
23
|
-
output.add_row %w(aap noot mies)
|
24
|
-
output.add_row %w(aap noot mies)
|
25
|
-
output.rowcount.must_equal 2
|
26
|
-
end
|
27
|
-
|
28
|
-
it 'must have a header' do
|
29
|
-
output = DocParser::ScreenOutput.new
|
30
|
-
lambda do
|
31
|
-
output.add_row %w(aap noot mies)
|
32
|
-
end.must_raise(DocParser::MissingHeaderException)
|
33
|
-
end
|
34
|
-
|
35
|
-
it 'must output the data after close' do
|
36
|
-
$out = StringIO.new
|
37
|
-
output = Class.new DocParser::ScreenOutput do
|
38
|
-
def page(*args, &p)
|
39
|
-
args << p
|
40
|
-
args.compact!
|
41
|
-
page_to $out, args
|
42
|
-
end
|
43
|
-
end.new
|
44
|
-
output.header = 'test', 'the', 'header'
|
45
|
-
output.add_row ['aap1' , '', 'mies']
|
46
|
-
output.add_row %w(aap2 mies1)
|
47
|
-
output.close
|
48
|
-
out = $out.string
|
49
|
-
out.must_include 'header'
|
50
|
-
out.must_include 'aap1'
|
51
|
-
out.must_include 'mies1'
|
52
|
-
out.must_include 'mies'
|
53
|
-
end
|
54
|
-
end
|