docparser 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Rakefile +9 -3
- data/docparser.gemspec +13 -12
- data/example.rb +0 -1
- data/lib/docparser/document.rb +4 -5
- data/lib/docparser/output.rb +2 -2
- data/lib/docparser/output/json_output.rb +2 -2
- data/lib/docparser/version.rb +1 -1
- data/test/.rubocop.yml +3 -0
- data/test/lib/docparser/document_test.rb +1 -1
- data/test/lib/docparser/output/csv_output_test.rb +3 -3
- data/test/lib/docparser/output/html_output_test.rb +5 -5
- data/test/lib/docparser/output/json_output_test.rb +10 -9
- data/test/lib/docparser/output/multi_output_test.rb +4 -4
- data/test/lib/docparser/output/nil_output_test.rb +2 -2
- data/test/lib/docparser/output/screen_output_test.rb +5 -6
- data/test/lib/docparser/output/xlsx_output_test.rb +3 -3
- data/test/lib/docparser/output/yaml_output_test.rb +6 -6
- data/test/lib/docparser/output_test.rb +1 -2
- data/test/lib/docparser/parser_test.rb +0 -2
- data/test/test_helper.rb +1 -1
- metadata +10 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 907927469491015367a9b5ba12ff4a8122495428
|
4
|
+
data.tar.gz: 5c842a24a58026c8296d61ca95d921f9ab20ccf9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 126b0563186b5f7dc9b94a55ee576d3f07818119056c99bd8dd938f940cb5c19b942cdb380ad9f2dc0367383b4e30cf42b8a2468cb9cad734f5cd716e92ce192
|
7
|
+
data.tar.gz: 7abef08de7561f3e8486141c311655bf8f13e1d4c6a658b9a9919c56f0d23fc48c071b6df14211f19f0d3987018d97b739065c01bb5fed267c38f3e86292071e
|
data/Gemfile
CHANGED
data/Rakefile
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
-
require
|
1
|
+
require 'bundler/gem_tasks'
|
2
2
|
require 'rake/testtask'
|
3
|
+
require 'rubocop'
|
4
|
+
require 'yard'
|
5
|
+
YARD::Rake::YardocTask.new
|
3
6
|
|
4
7
|
Rake::TestTask.new do |t|
|
5
8
|
t.libs << 'lib/docparser'
|
@@ -10,7 +13,10 @@ end
|
|
10
13
|
task test: :rubocop
|
11
14
|
|
12
15
|
task :rubocop do
|
13
|
-
|
16
|
+
puts "Running Rubocop #{Rubocop::Version::STRING}"
|
17
|
+
args = FileList['**/*.rb', 'Rakefile', 'docparser.gemspec']
|
18
|
+
cli = Rubocop::CLI.new
|
19
|
+
fail unless cli.run(args) == 0
|
14
20
|
end
|
15
21
|
|
16
|
-
task :
|
22
|
+
task default: :test
|
data/docparser.gemspec
CHANGED
@@ -3,27 +3,28 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
3
3
|
require 'docparser/version'
|
4
4
|
|
5
5
|
Gem::Specification.new do |spec|
|
6
|
-
spec.name =
|
6
|
+
spec.name = 'docparser'
|
7
7
|
spec.version = DocParser::VERSION
|
8
|
-
spec.authors = [
|
9
|
-
spec.email = [
|
10
|
-
spec.description =
|
11
|
-
spec.summary =
|
12
|
-
spec.homepage =
|
13
|
-
spec.license =
|
8
|
+
spec.authors = ['Jurriaan Pruis']
|
9
|
+
spec.email = ['email@jurriaanpruis.nl']
|
10
|
+
spec.description = 'DocParser is a Ruby Gem for webscraping'
|
11
|
+
spec.summary = 'DocParser is a Ruby Gem for webscraping'
|
12
|
+
spec.homepage = 'https://github.com/jurriaan/docparser'
|
13
|
+
spec.license = 'MIT'
|
14
14
|
spec.platform = Gem::Platform::RUBY
|
15
15
|
|
16
|
-
spec.files = `git ls-files`.split(
|
17
|
-
spec.executables = spec.files.grep(
|
18
|
-
spec.test_files = spec.files.grep(
|
19
|
-
spec.require_paths = [
|
16
|
+
spec.files = `git ls-files`.split($RS)
|
17
|
+
spec.executables = spec.files.grep(/^bin\//) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(/^(test|spec|features)\//)
|
19
|
+
spec.require_paths = ['lib']
|
20
|
+
spec.extra_rdoc_files = ['README.md', 'LICENSE']
|
20
21
|
|
21
22
|
spec.add_runtime_dependency 'nokogiri', '~> 1.5.9'
|
22
23
|
spec.add_runtime_dependency 'parallel', '~> 0.6.4'
|
23
24
|
spec.add_runtime_dependency 'axlsx', '~> 1.3.6'
|
24
25
|
spec.add_runtime_dependency 'terminal-table', '~> 1.4.5'
|
25
26
|
spec.add_runtime_dependency 'pageme', '~> 0.0.3'
|
26
|
-
spec.add_runtime_dependency '
|
27
|
+
spec.add_runtime_dependency 'multi_json', '~> 1.7'
|
27
28
|
spec.add_runtime_dependency 'log4r', '~> 1.1.10'
|
28
29
|
|
29
30
|
spec.add_development_dependency 'yard'
|
data/example.rb
CHANGED
data/lib/docparser/document.rb
CHANGED
@@ -5,6 +5,10 @@ module DocParser
|
|
5
5
|
# @see Output
|
6
6
|
class Document
|
7
7
|
attr_reader :filename, :doc, :encoding, :results
|
8
|
+
|
9
|
+
# @return [String] the source of the document
|
10
|
+
attr_reader :html
|
11
|
+
|
8
12
|
def initialize(filename: nil, encoding: 'utf-8', parser: nil)
|
9
13
|
if encoding == 'utf-8'
|
10
14
|
encodingstring = 'r:utf-8'
|
@@ -37,11 +41,6 @@ module DocParser
|
|
37
41
|
@title ||= xpath_content('//head/title')
|
38
42
|
end
|
39
43
|
|
40
|
-
# @return [String] the source of the document
|
41
|
-
def html
|
42
|
-
@html
|
43
|
-
end
|
44
|
-
|
45
44
|
# Executes an XPath query
|
46
45
|
def xpath(query)
|
47
46
|
res = @doc.search(query)
|
data/lib/docparser/output.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'multi_json'
|
2
2
|
module DocParser
|
3
3
|
# The JSONOutput class generates a JSON file containing all rows as separate
|
4
4
|
# Array elements
|
@@ -25,7 +25,7 @@ module DocParser
|
|
25
25
|
@doc[@header[counter]] = ''
|
26
26
|
end
|
27
27
|
end
|
28
|
-
@file <<
|
28
|
+
@file << MultiJson.dump(@doc)
|
29
29
|
end
|
30
30
|
|
31
31
|
def footer
|
data/lib/docparser/version.rb
CHANGED
data/test/.rubocop.yml
ADDED
@@ -110,7 +110,7 @@ describe DocParser::Document do
|
|
110
110
|
it 'should add the row to the results' do
|
111
111
|
@test_doc.add_row ['test']
|
112
112
|
@test_doc.add_row 'test', 'test2'
|
113
|
-
@test_doc.results.must_equal [[
|
113
|
+
@test_doc.results.must_equal [[%w(test), %w(test test2)]]
|
114
114
|
end
|
115
115
|
|
116
116
|
it 'should be possible to not use outputs' do
|
@@ -30,7 +30,7 @@ describe DocParser::CSVOutput do
|
|
30
30
|
Dir.mktmpdir do |dir|
|
31
31
|
filename = File.join(dir, 'test.csv')
|
32
32
|
output = DocParser::CSVOutput.new(filename: filename)
|
33
|
-
output.add_row
|
33
|
+
output.add_row %w(aap noot mies)
|
34
34
|
output.add_row ['aap', 'noot', 'mies;']
|
35
35
|
output.close
|
36
36
|
open(filename).read.must_equal "aap;noot;mies\naap;noot;\"mies;\"\n"
|
@@ -43,8 +43,8 @@ describe DocParser::CSVOutput do
|
|
43
43
|
output = DocParser::CSVOutput.new(filename: filename)
|
44
44
|
output.header = 'test', 'the', 'header'
|
45
45
|
output.rowcount.must_equal 0
|
46
|
-
output.add_row
|
47
|
-
output.add_row
|
46
|
+
output.add_row %w(aap noot mies)
|
47
|
+
output.add_row %w(aap noot mies)
|
48
48
|
output.rowcount.must_equal 2
|
49
49
|
end
|
50
50
|
end
|
@@ -22,8 +22,8 @@ describe DocParser::HTMLOutput do
|
|
22
22
|
output = DocParser::HTMLOutput.new(filename: filename)
|
23
23
|
output.header = 'test', 'the', 'header'
|
24
24
|
output.close
|
25
|
-
open(filename).read.must_include '<thead><tr><th>test</th><th>the</th>
|
26
|
-
|
25
|
+
open(filename).read.must_include '<thead><tr><th>test</th><th>the</th>
|
26
|
+
<th>header</th></tr></thead>'.gsub(/\s+/, '')
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
@@ -31,7 +31,7 @@ describe DocParser::HTMLOutput do
|
|
31
31
|
Dir.mktmpdir do |dir|
|
32
32
|
filename = File.join(dir, 'test.html')
|
33
33
|
output = DocParser::HTMLOutput.new(filename: filename)
|
34
|
-
output.add_row
|
34
|
+
output.add_row %w(aap noot mies)
|
35
35
|
output.add_row ['aap', 'noot', 'mies;']
|
36
36
|
output.close
|
37
37
|
html = open(filename).read
|
@@ -47,8 +47,8 @@ describe DocParser::HTMLOutput do
|
|
47
47
|
output = DocParser::HTMLOutput.new(filename: filename)
|
48
48
|
output.header = 'test', 'the', 'header'
|
49
49
|
output.rowcount.must_equal 0
|
50
|
-
output.add_row
|
51
|
-
output.add_row
|
50
|
+
output.add_row %w(aap noot mies)
|
51
|
+
output.add_row %w(aap noot mies)
|
52
52
|
output.rowcount.must_equal 2
|
53
53
|
output.close
|
54
54
|
open(filename).read.must_include('<p>2 rows</p>')
|
@@ -31,7 +31,7 @@ describe DocParser::JSONOutput do
|
|
31
31
|
filename = File.join(dir, 'test.json')
|
32
32
|
output = DocParser::JSONOutput.new(filename: filename)
|
33
33
|
-> do
|
34
|
-
output.add_row
|
34
|
+
output.add_row %w(aap noot mies)
|
35
35
|
end.must_raise(DocParser::MissingHeaderException)
|
36
36
|
end
|
37
37
|
end
|
@@ -41,13 +41,14 @@ describe DocParser::JSONOutput do
|
|
41
41
|
filename = File.join(dir, 'test.json')
|
42
42
|
output = DocParser::JSONOutput.new(filename: filename)
|
43
43
|
output.header = 'test', 'the', 'header'
|
44
|
-
output.add_row
|
45
|
-
output.add_row
|
46
|
-
output.add_row
|
44
|
+
output.add_row %w(a b c)
|
45
|
+
output.add_row %w(aap noot mies")
|
46
|
+
output.add_row %w(aap noot) # testing empty column
|
47
47
|
output.close
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
expected = '[{"test":"a","the":"b","header":"c"},
|
49
|
+
{"test":"aap","the":"noot","header":"mies\""},
|
50
|
+
{"test":"aap","the":"noot","header":""}]'.gsub(/\s+/, '')
|
51
|
+
open(filename).read.must_equal expected
|
51
52
|
end
|
52
53
|
end
|
53
54
|
|
@@ -57,8 +58,8 @@ describe DocParser::JSONOutput do
|
|
57
58
|
output = DocParser::JSONOutput.new(filename: filename)
|
58
59
|
output.header = 'test', 'the', 'header'
|
59
60
|
output.rowcount.must_equal 0
|
60
|
-
output.add_row
|
61
|
-
output.add_row
|
61
|
+
output.add_row %w(aap noot mies)
|
62
|
+
output.add_row %w(aap noot mies)
|
62
63
|
output.rowcount.must_equal 2
|
63
64
|
end
|
64
65
|
end
|
@@ -36,7 +36,7 @@ describe DocParser::MultiOutput do
|
|
36
36
|
filename = File.join(dir, 'test')
|
37
37
|
output = DocParser::MultiOutput.new(filename: filename)
|
38
38
|
-> do
|
39
|
-
output.add_row
|
39
|
+
output.add_row %w(aap noot mies)
|
40
40
|
end.must_raise(DocParser::MissingHeaderException)
|
41
41
|
end
|
42
42
|
end
|
@@ -47,8 +47,8 @@ describe DocParser::MultiOutput do
|
|
47
47
|
output = DocParser::MultiOutput.new(filename: filename)
|
48
48
|
output.header = 'test', 'the', 'header'
|
49
49
|
output.rowcount.must_equal 0
|
50
|
-
output.add_row
|
51
|
-
output.add_row
|
50
|
+
output.add_row %w(aap noot mies)
|
51
|
+
output.add_row %w(aap noot mies)
|
52
52
|
output.rowcount.must_equal 2
|
53
53
|
end
|
54
54
|
end
|
@@ -57,7 +57,7 @@ describe DocParser::MultiOutput do
|
|
57
57
|
Dir.mktmpdir do |dir|
|
58
58
|
filename = File.join(dir, 'test')
|
59
59
|
output = DocParser::MultiOutput.new(filename: filename)
|
60
|
-
methods =
|
60
|
+
methods = %i(add_row header= close)
|
61
61
|
outputs = output.instance_variable_get(:@outputs)
|
62
62
|
outputs.map! do |o|
|
63
63
|
SimpleMock.new o
|
@@ -20,8 +20,8 @@ describe DocParser::NilOutput do
|
|
20
20
|
output = DocParser::NilOutput.new
|
21
21
|
output.header = 'test', 'the', 'header'
|
22
22
|
output.rowcount.must_equal 0
|
23
|
-
output.add_row
|
24
|
-
output.add_row
|
23
|
+
output.add_row %w(aap noot mies)
|
24
|
+
output.add_row %w(aap noot mies)
|
25
25
|
output.rowcount.must_equal 0
|
26
26
|
end
|
27
27
|
end
|
@@ -20,19 +20,18 @@ describe DocParser::ScreenOutput do
|
|
20
20
|
output = DocParser::ScreenOutput.new
|
21
21
|
output.header = 'test', 'the', 'header'
|
22
22
|
output.rowcount.must_equal 0
|
23
|
-
output.add_row
|
24
|
-
output.add_row
|
23
|
+
output.add_row %w(aap noot mies)
|
24
|
+
output.add_row %w(aap noot mies)
|
25
25
|
output.rowcount.must_equal 2
|
26
26
|
end
|
27
27
|
|
28
28
|
it 'must have a header' do
|
29
29
|
output = DocParser::ScreenOutput.new
|
30
30
|
-> do
|
31
|
-
output.add_row
|
31
|
+
output.add_row %w(aap noot mies)
|
32
32
|
end.must_raise(DocParser::MissingHeaderException)
|
33
33
|
end
|
34
34
|
|
35
|
-
|
36
35
|
it 'must output the data after close' do
|
37
36
|
$out = StringIO.new
|
38
37
|
output = Class.new DocParser::ScreenOutput do
|
@@ -43,8 +42,8 @@ describe DocParser::ScreenOutput do
|
|
43
42
|
end
|
44
43
|
end.new
|
45
44
|
output.header = 'test', 'the', 'header'
|
46
|
-
output.add_row ['aap1', '', 'mies']
|
47
|
-
output.add_row
|
45
|
+
output.add_row ['aap1' , '', 'mies']
|
46
|
+
output.add_row %w(aap2 mies1)
|
48
47
|
output.close
|
49
48
|
out = $out.string
|
50
49
|
out.must_include 'header'
|
@@ -31,7 +31,7 @@ describe DocParser::XLSXOutput do
|
|
31
31
|
Dir.mktmpdir do |dir|
|
32
32
|
filename = File.join(dir, 'test.xlsx')
|
33
33
|
output = DocParser::XLSXOutput.new(filename: filename)
|
34
|
-
output.add_row
|
34
|
+
output.add_row %w(aap noot mies)
|
35
35
|
output.add_row ['aap', 'noot', 'mies;']
|
36
36
|
output.close
|
37
37
|
sheet = output.instance_variable_get(:@sheet)
|
@@ -45,8 +45,8 @@ describe DocParser::XLSXOutput do
|
|
45
45
|
output = DocParser::XLSXOutput.new(filename: filename)
|
46
46
|
output.header = 'test', 'the', 'header'
|
47
47
|
output.rowcount.must_equal 0
|
48
|
-
output.add_row
|
49
|
-
output.add_row
|
48
|
+
output.add_row %w(aap noot mies)
|
49
|
+
output.add_row %w(aap noot mies)
|
50
50
|
output.rowcount.must_equal 2
|
51
51
|
end
|
52
52
|
end
|
@@ -31,7 +31,7 @@ describe DocParser::YAMLOutput do
|
|
31
31
|
filename = File.join(dir, 'test.yml')
|
32
32
|
output = DocParser::YAMLOutput.new(filename: filename)
|
33
33
|
-> do
|
34
|
-
output.add_row
|
34
|
+
output.add_row %w(aap noot mies)
|
35
35
|
end.must_raise(DocParser::MissingHeaderException)
|
36
36
|
end
|
37
37
|
end
|
@@ -41,9 +41,9 @@ describe DocParser::YAMLOutput do
|
|
41
41
|
filename = File.join(dir, 'test.csv')
|
42
42
|
output = DocParser::YAMLOutput.new(filename: filename)
|
43
43
|
output.header = 'test', 'the', 'header'
|
44
|
-
output.add_row
|
45
|
-
output.add_row
|
46
|
-
output.add_row
|
44
|
+
output.add_row %w(a b c)
|
45
|
+
output.add_row %w(aap noot mies")
|
46
|
+
output.add_row %w(aap noot) # testing empty column
|
47
47
|
output.close
|
48
48
|
open(filename).read.must_equal <<-YAMLEND
|
49
49
|
---
|
@@ -68,8 +68,8 @@ YAMLEND
|
|
68
68
|
output = DocParser::YAMLOutput.new(filename: filename)
|
69
69
|
output.header = 'test', 'the', 'header'
|
70
70
|
output.rowcount.must_equal 0
|
71
|
-
output.add_row
|
72
|
-
output.add_row
|
71
|
+
output.add_row %w(aap noot mies)
|
72
|
+
output.add_row %w(aap noot mies)
|
73
73
|
output.rowcount.must_equal 2
|
74
74
|
end
|
75
75
|
end
|
@@ -32,7 +32,7 @@ describe DocParser::Output do
|
|
32
32
|
output.header = 'test', 'the', 'header'
|
33
33
|
end
|
34
34
|
header = output.instance_variable_get(:@header)
|
35
|
-
header.must_equal
|
35
|
+
header.must_equal %w(test the header)
|
36
36
|
$method_id.must_equal :header
|
37
37
|
end
|
38
38
|
end
|
@@ -75,7 +75,6 @@ describe DocParser::Output do
|
|
75
75
|
end
|
76
76
|
|
77
77
|
it 'should raise a NotImplementedError on write_row' do
|
78
|
-
|
79
78
|
Dir.mktmpdir do |dir|
|
80
79
|
filename = File.join(dir, 'test.csv')
|
81
80
|
output = DocParser::Output.new(filename: filename)
|
@@ -162,7 +162,6 @@ describe DocParser::Parser do
|
|
162
162
|
mock_output2.verify.must_equal true
|
163
163
|
end
|
164
164
|
|
165
|
-
|
166
165
|
it 'should support parallel processing' do
|
167
166
|
mock_output = SimpleMock.new DocParser::NilOutput.new
|
168
167
|
mock_output.expect :close, nil
|
@@ -193,5 +192,4 @@ describe DocParser::Parser do
|
|
193
192
|
$method_id.must_equal :fork
|
194
193
|
mock_output.verify.must_equal true
|
195
194
|
end
|
196
|
-
|
197
195
|
end
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jurriaan Pruis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-05-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -81,19 +81,19 @@ dependencies:
|
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 0.0.3
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: multi_json
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - ~>
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 1.7
|
89
|
+
version: '1.7'
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - ~>
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: 1.7
|
96
|
+
version: '1.7'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: log4r
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -127,7 +127,9 @@ email:
|
|
127
127
|
- email@jurriaanpruis.nl
|
128
128
|
executables: []
|
129
129
|
extensions: []
|
130
|
-
extra_rdoc_files:
|
130
|
+
extra_rdoc_files:
|
131
|
+
- README.md
|
132
|
+
- LICENSE
|
131
133
|
files:
|
132
134
|
- .coveralls.yml
|
133
135
|
- .gitignore
|
@@ -153,6 +155,7 @@ files:
|
|
153
155
|
- lib/docparser/output/yaml_output.rb
|
154
156
|
- lib/docparser/parser.rb
|
155
157
|
- lib/docparser/version.rb
|
158
|
+
- test/.rubocop.yml
|
156
159
|
- test/lib/docparser/blackbox_test.rb
|
157
160
|
- test/lib/docparser/document_test.rb
|
158
161
|
- test/lib/docparser/logging_test.rb
|
@@ -238,6 +241,7 @@ signing_key:
|
|
238
241
|
specification_version: 4
|
239
242
|
summary: DocParser is a Ruby Gem for webscraping
|
240
243
|
test_files:
|
244
|
+
- test/.rubocop.yml
|
241
245
|
- test/lib/docparser/blackbox_test.rb
|
242
246
|
- test/lib/docparser/document_test.rb
|
243
247
|
- test/lib/docparser/logging_test.rb
|