docparser 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Rakefile +9 -3
- data/docparser.gemspec +13 -12
- data/example.rb +0 -1
- data/lib/docparser/document.rb +4 -5
- data/lib/docparser/output.rb +2 -2
- data/lib/docparser/output/json_output.rb +2 -2
- data/lib/docparser/version.rb +1 -1
- data/test/.rubocop.yml +3 -0
- data/test/lib/docparser/document_test.rb +1 -1
- data/test/lib/docparser/output/csv_output_test.rb +3 -3
- data/test/lib/docparser/output/html_output_test.rb +5 -5
- data/test/lib/docparser/output/json_output_test.rb +10 -9
- data/test/lib/docparser/output/multi_output_test.rb +4 -4
- data/test/lib/docparser/output/nil_output_test.rb +2 -2
- data/test/lib/docparser/output/screen_output_test.rb +5 -6
- data/test/lib/docparser/output/xlsx_output_test.rb +3 -3
- data/test/lib/docparser/output/yaml_output_test.rb +6 -6
- data/test/lib/docparser/output_test.rb +1 -2
- data/test/lib/docparser/parser_test.rb +0 -2
- data/test/test_helper.rb +1 -1
- metadata +10 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 907927469491015367a9b5ba12ff4a8122495428
|
4
|
+
data.tar.gz: 5c842a24a58026c8296d61ca95d921f9ab20ccf9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 126b0563186b5f7dc9b94a55ee576d3f07818119056c99bd8dd938f940cb5c19b942cdb380ad9f2dc0367383b4e30cf42b8a2468cb9cad734f5cd716e92ce192
|
7
|
+
data.tar.gz: 7abef08de7561f3e8486141c311655bf8f13e1d4c6a658b9a9919c56f0d23fc48c071b6df14211f19f0d3987018d97b739065c01bb5fed267c38f3e86292071e
|
data/Gemfile
CHANGED
data/Rakefile
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
-
require
|
1
|
+
require 'bundler/gem_tasks'
|
2
2
|
require 'rake/testtask'
|
3
|
+
require 'rubocop'
|
4
|
+
require 'yard'
|
5
|
+
YARD::Rake::YardocTask.new
|
3
6
|
|
4
7
|
Rake::TestTask.new do |t|
|
5
8
|
t.libs << 'lib/docparser'
|
@@ -10,7 +13,10 @@ end
|
|
10
13
|
task test: :rubocop
|
11
14
|
|
12
15
|
task :rubocop do
|
13
|
-
|
16
|
+
puts "Running Rubocop #{Rubocop::Version::STRING}"
|
17
|
+
args = FileList['**/*.rb', 'Rakefile', 'docparser.gemspec']
|
18
|
+
cli = Rubocop::CLI.new
|
19
|
+
fail unless cli.run(args) == 0
|
14
20
|
end
|
15
21
|
|
16
|
-
task :
|
22
|
+
task default: :test
|
data/docparser.gemspec
CHANGED
@@ -3,27 +3,28 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
3
3
|
require 'docparser/version'
|
4
4
|
|
5
5
|
Gem::Specification.new do |spec|
|
6
|
-
spec.name =
|
6
|
+
spec.name = 'docparser'
|
7
7
|
spec.version = DocParser::VERSION
|
8
|
-
spec.authors = [
|
9
|
-
spec.email = [
|
10
|
-
spec.description =
|
11
|
-
spec.summary =
|
12
|
-
spec.homepage =
|
13
|
-
spec.license =
|
8
|
+
spec.authors = ['Jurriaan Pruis']
|
9
|
+
spec.email = ['email@jurriaanpruis.nl']
|
10
|
+
spec.description = 'DocParser is a Ruby Gem for webscraping'
|
11
|
+
spec.summary = 'DocParser is a Ruby Gem for webscraping'
|
12
|
+
spec.homepage = 'https://github.com/jurriaan/docparser'
|
13
|
+
spec.license = 'MIT'
|
14
14
|
spec.platform = Gem::Platform::RUBY
|
15
15
|
|
16
|
-
spec.files = `git ls-files`.split(
|
17
|
-
spec.executables = spec.files.grep(
|
18
|
-
spec.test_files = spec.files.grep(
|
19
|
-
spec.require_paths = [
|
16
|
+
spec.files = `git ls-files`.split($RS)
|
17
|
+
spec.executables = spec.files.grep(/^bin\//) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(/^(test|spec|features)\//)
|
19
|
+
spec.require_paths = ['lib']
|
20
|
+
spec.extra_rdoc_files = ['README.md', 'LICENSE']
|
20
21
|
|
21
22
|
spec.add_runtime_dependency 'nokogiri', '~> 1.5.9'
|
22
23
|
spec.add_runtime_dependency 'parallel', '~> 0.6.4'
|
23
24
|
spec.add_runtime_dependency 'axlsx', '~> 1.3.6'
|
24
25
|
spec.add_runtime_dependency 'terminal-table', '~> 1.4.5'
|
25
26
|
spec.add_runtime_dependency 'pageme', '~> 0.0.3'
|
26
|
-
spec.add_runtime_dependency '
|
27
|
+
spec.add_runtime_dependency 'multi_json', '~> 1.7'
|
27
28
|
spec.add_runtime_dependency 'log4r', '~> 1.1.10'
|
28
29
|
|
29
30
|
spec.add_development_dependency 'yard'
|
data/example.rb
CHANGED
data/lib/docparser/document.rb
CHANGED
@@ -5,6 +5,10 @@ module DocParser
|
|
5
5
|
# @see Output
|
6
6
|
class Document
|
7
7
|
attr_reader :filename, :doc, :encoding, :results
|
8
|
+
|
9
|
+
# @return [String] the source of the document
|
10
|
+
attr_reader :html
|
11
|
+
|
8
12
|
def initialize(filename: nil, encoding: 'utf-8', parser: nil)
|
9
13
|
if encoding == 'utf-8'
|
10
14
|
encodingstring = 'r:utf-8'
|
@@ -37,11 +41,6 @@ module DocParser
|
|
37
41
|
@title ||= xpath_content('//head/title')
|
38
42
|
end
|
39
43
|
|
40
|
-
# @return [String] the source of the document
|
41
|
-
def html
|
42
|
-
@html
|
43
|
-
end
|
44
|
-
|
45
44
|
# Executes a xpath query
|
46
45
|
def xpath(query)
|
47
46
|
res = @doc.search(query)
|
data/lib/docparser/output.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'multi_json'
|
2
2
|
module DocParser
|
3
3
|
# The JSONOutput class generates a JSON file containing all rows as separate
|
4
4
|
# Array elements
|
@@ -25,7 +25,7 @@ module DocParser
|
|
25
25
|
@doc[@header[counter]] = ''
|
26
26
|
end
|
27
27
|
end
|
28
|
-
@file <<
|
28
|
+
@file << MultiJson.dump(@doc)
|
29
29
|
end
|
30
30
|
|
31
31
|
def footer
|
data/lib/docparser/version.rb
CHANGED
data/test/.rubocop.yml
ADDED
@@ -110,7 +110,7 @@ describe DocParser::Document do
|
|
110
110
|
it 'should add the row to the results' do
|
111
111
|
@test_doc.add_row ['test']
|
112
112
|
@test_doc.add_row 'test', 'test2'
|
113
|
-
@test_doc.results.must_equal [[
|
113
|
+
@test_doc.results.must_equal [[%w(test), %w(test test2)]]
|
114
114
|
end
|
115
115
|
|
116
116
|
it 'should be possible to not use outputs' do
|
@@ -30,7 +30,7 @@ describe DocParser::CSVOutput do
|
|
30
30
|
Dir.mktmpdir do |dir|
|
31
31
|
filename = File.join(dir, 'test.csv')
|
32
32
|
output = DocParser::CSVOutput.new(filename: filename)
|
33
|
-
output.add_row
|
33
|
+
output.add_row %w(aap noot mies)
|
34
34
|
output.add_row ['aap', 'noot', 'mies;']
|
35
35
|
output.close
|
36
36
|
open(filename).read.must_equal "aap;noot;mies\naap;noot;\"mies;\"\n"
|
@@ -43,8 +43,8 @@ describe DocParser::CSVOutput do
|
|
43
43
|
output = DocParser::CSVOutput.new(filename: filename)
|
44
44
|
output.header = 'test', 'the', 'header'
|
45
45
|
output.rowcount.must_equal 0
|
46
|
-
output.add_row
|
47
|
-
output.add_row
|
46
|
+
output.add_row %w(aap noot mies)
|
47
|
+
output.add_row %w(aap noot mies)
|
48
48
|
output.rowcount.must_equal 2
|
49
49
|
end
|
50
50
|
end
|
@@ -22,8 +22,8 @@ describe DocParser::HTMLOutput do
|
|
22
22
|
output = DocParser::HTMLOutput.new(filename: filename)
|
23
23
|
output.header = 'test', 'the', 'header'
|
24
24
|
output.close
|
25
|
-
open(filename).read.must_include '<thead><tr><th>test</th><th>the</th>
|
26
|
-
|
25
|
+
open(filename).read.must_include '<thead><tr><th>test</th><th>the</th>
|
26
|
+
<th>header</th></tr></thead>'.gsub(/\s+/, '')
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
@@ -31,7 +31,7 @@ describe DocParser::HTMLOutput do
|
|
31
31
|
Dir.mktmpdir do |dir|
|
32
32
|
filename = File.join(dir, 'test.html')
|
33
33
|
output = DocParser::HTMLOutput.new(filename: filename)
|
34
|
-
output.add_row
|
34
|
+
output.add_row %w(aap noot mies)
|
35
35
|
output.add_row ['aap', 'noot', 'mies;']
|
36
36
|
output.close
|
37
37
|
html = open(filename).read
|
@@ -47,8 +47,8 @@ describe DocParser::HTMLOutput do
|
|
47
47
|
output = DocParser::HTMLOutput.new(filename: filename)
|
48
48
|
output.header = 'test', 'the', 'header'
|
49
49
|
output.rowcount.must_equal 0
|
50
|
-
output.add_row
|
51
|
-
output.add_row
|
50
|
+
output.add_row %w(aap noot mies)
|
51
|
+
output.add_row %w(aap noot mies)
|
52
52
|
output.rowcount.must_equal 2
|
53
53
|
output.close
|
54
54
|
open(filename).read.must_include('<p>2 rows</p>')
|
@@ -31,7 +31,7 @@ describe DocParser::JSONOutput do
|
|
31
31
|
filename = File.join(dir, 'test.json')
|
32
32
|
output = DocParser::JSONOutput.new(filename: filename)
|
33
33
|
-> do
|
34
|
-
output.add_row
|
34
|
+
output.add_row %w(aap noot mies)
|
35
35
|
end.must_raise(DocParser::MissingHeaderException)
|
36
36
|
end
|
37
37
|
end
|
@@ -41,13 +41,14 @@ describe DocParser::JSONOutput do
|
|
41
41
|
filename = File.join(dir, 'test.json')
|
42
42
|
output = DocParser::JSONOutput.new(filename: filename)
|
43
43
|
output.header = 'test', 'the', 'header'
|
44
|
-
output.add_row
|
45
|
-
output.add_row
|
46
|
-
output.add_row
|
44
|
+
output.add_row %w(a b c)
|
45
|
+
output.add_row %w(aap noot mies")
|
46
|
+
output.add_row %w(aap noot) # testing empty column
|
47
47
|
output.close
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
expected = '[{"test":"a","the":"b","header":"c"},
|
49
|
+
{"test":"aap","the":"noot","header":"mies\""},
|
50
|
+
{"test":"aap","the":"noot","header":""}]'.gsub(/\s+/, '')
|
51
|
+
open(filename).read.must_equal expected
|
51
52
|
end
|
52
53
|
end
|
53
54
|
|
@@ -57,8 +58,8 @@ describe DocParser::JSONOutput do
|
|
57
58
|
output = DocParser::JSONOutput.new(filename: filename)
|
58
59
|
output.header = 'test', 'the', 'header'
|
59
60
|
output.rowcount.must_equal 0
|
60
|
-
output.add_row
|
61
|
-
output.add_row
|
61
|
+
output.add_row %w(aap noot mies)
|
62
|
+
output.add_row %w(aap noot mies)
|
62
63
|
output.rowcount.must_equal 2
|
63
64
|
end
|
64
65
|
end
|
@@ -36,7 +36,7 @@ describe DocParser::MultiOutput do
|
|
36
36
|
filename = File.join(dir, 'test')
|
37
37
|
output = DocParser::MultiOutput.new(filename: filename)
|
38
38
|
-> do
|
39
|
-
output.add_row
|
39
|
+
output.add_row %w(aap noot mies)
|
40
40
|
end.must_raise(DocParser::MissingHeaderException)
|
41
41
|
end
|
42
42
|
end
|
@@ -47,8 +47,8 @@ describe DocParser::MultiOutput do
|
|
47
47
|
output = DocParser::MultiOutput.new(filename: filename)
|
48
48
|
output.header = 'test', 'the', 'header'
|
49
49
|
output.rowcount.must_equal 0
|
50
|
-
output.add_row
|
51
|
-
output.add_row
|
50
|
+
output.add_row %w(aap noot mies)
|
51
|
+
output.add_row %w(aap noot mies)
|
52
52
|
output.rowcount.must_equal 2
|
53
53
|
end
|
54
54
|
end
|
@@ -57,7 +57,7 @@ describe DocParser::MultiOutput do
|
|
57
57
|
Dir.mktmpdir do |dir|
|
58
58
|
filename = File.join(dir, 'test')
|
59
59
|
output = DocParser::MultiOutput.new(filename: filename)
|
60
|
-
methods =
|
60
|
+
methods = %i(add_row header= close)
|
61
61
|
outputs = output.instance_variable_get(:@outputs)
|
62
62
|
outputs.map! do |o|
|
63
63
|
SimpleMock.new o
|
@@ -20,8 +20,8 @@ describe DocParser::NilOutput do
|
|
20
20
|
output = DocParser::NilOutput.new
|
21
21
|
output.header = 'test', 'the', 'header'
|
22
22
|
output.rowcount.must_equal 0
|
23
|
-
output.add_row
|
24
|
-
output.add_row
|
23
|
+
output.add_row %w(aap noot mies)
|
24
|
+
output.add_row %w(aap noot mies)
|
25
25
|
output.rowcount.must_equal 0
|
26
26
|
end
|
27
27
|
end
|
@@ -20,19 +20,18 @@ describe DocParser::ScreenOutput do
|
|
20
20
|
output = DocParser::ScreenOutput.new
|
21
21
|
output.header = 'test', 'the', 'header'
|
22
22
|
output.rowcount.must_equal 0
|
23
|
-
output.add_row
|
24
|
-
output.add_row
|
23
|
+
output.add_row %w(aap noot mies)
|
24
|
+
output.add_row %w(aap noot mies)
|
25
25
|
output.rowcount.must_equal 2
|
26
26
|
end
|
27
27
|
|
28
28
|
it 'must have a header' do
|
29
29
|
output = DocParser::ScreenOutput.new
|
30
30
|
-> do
|
31
|
-
output.add_row
|
31
|
+
output.add_row %w(aap noot mies)
|
32
32
|
end.must_raise(DocParser::MissingHeaderException)
|
33
33
|
end
|
34
34
|
|
35
|
-
|
36
35
|
it 'must output the data after close' do
|
37
36
|
$out = StringIO.new
|
38
37
|
output = Class.new DocParser::ScreenOutput do
|
@@ -43,8 +42,8 @@ describe DocParser::ScreenOutput do
|
|
43
42
|
end
|
44
43
|
end.new
|
45
44
|
output.header = 'test', 'the', 'header'
|
46
|
-
output.add_row ['aap1', '', 'mies']
|
47
|
-
output.add_row
|
45
|
+
output.add_row ['aap1' , '', 'mies']
|
46
|
+
output.add_row %w(aap2 mies1)
|
48
47
|
output.close
|
49
48
|
out = $out.string
|
50
49
|
out.must_include 'header'
|
@@ -31,7 +31,7 @@ describe DocParser::XLSXOutput do
|
|
31
31
|
Dir.mktmpdir do |dir|
|
32
32
|
filename = File.join(dir, 'test.xlsx')
|
33
33
|
output = DocParser::XLSXOutput.new(filename: filename)
|
34
|
-
output.add_row
|
34
|
+
output.add_row %w(aap noot mies)
|
35
35
|
output.add_row ['aap', 'noot', 'mies;']
|
36
36
|
output.close
|
37
37
|
sheet = output.instance_variable_get(:@sheet)
|
@@ -45,8 +45,8 @@ describe DocParser::XLSXOutput do
|
|
45
45
|
output = DocParser::XLSXOutput.new(filename: filename)
|
46
46
|
output.header = 'test', 'the', 'header'
|
47
47
|
output.rowcount.must_equal 0
|
48
|
-
output.add_row
|
49
|
-
output.add_row
|
48
|
+
output.add_row %w(aap noot mies)
|
49
|
+
output.add_row %w(aap noot mies)
|
50
50
|
output.rowcount.must_equal 2
|
51
51
|
end
|
52
52
|
end
|
@@ -31,7 +31,7 @@ describe DocParser::YAMLOutput do
|
|
31
31
|
filename = File.join(dir, 'test.yml')
|
32
32
|
output = DocParser::YAMLOutput.new(filename: filename)
|
33
33
|
-> do
|
34
|
-
output.add_row
|
34
|
+
output.add_row %w(aap noot mies)
|
35
35
|
end.must_raise(DocParser::MissingHeaderException)
|
36
36
|
end
|
37
37
|
end
|
@@ -41,9 +41,9 @@ describe DocParser::YAMLOutput do
|
|
41
41
|
filename = File.join(dir, 'test.csv')
|
42
42
|
output = DocParser::YAMLOutput.new(filename: filename)
|
43
43
|
output.header = 'test', 'the', 'header'
|
44
|
-
output.add_row
|
45
|
-
output.add_row
|
46
|
-
output.add_row
|
44
|
+
output.add_row %w(a b c)
|
45
|
+
output.add_row %w(aap noot mies")
|
46
|
+
output.add_row %w(aap noot) # testing empty column
|
47
47
|
output.close
|
48
48
|
open(filename).read.must_equal <<-YAMLEND
|
49
49
|
---
|
@@ -68,8 +68,8 @@ YAMLEND
|
|
68
68
|
output = DocParser::YAMLOutput.new(filename: filename)
|
69
69
|
output.header = 'test', 'the', 'header'
|
70
70
|
output.rowcount.must_equal 0
|
71
|
-
output.add_row
|
72
|
-
output.add_row
|
71
|
+
output.add_row %w(aap noot mies)
|
72
|
+
output.add_row %w(aap noot mies)
|
73
73
|
output.rowcount.must_equal 2
|
74
74
|
end
|
75
75
|
end
|
@@ -32,7 +32,7 @@ describe DocParser::Output do
|
|
32
32
|
output.header = 'test', 'the', 'header'
|
33
33
|
end
|
34
34
|
header = output.instance_variable_get(:@header)
|
35
|
-
header.must_equal
|
35
|
+
header.must_equal %w(test the header)
|
36
36
|
$method_id.must_equal :header
|
37
37
|
end
|
38
38
|
end
|
@@ -75,7 +75,6 @@ describe DocParser::Output do
|
|
75
75
|
end
|
76
76
|
|
77
77
|
it 'should raise a NotImplementedError on write_row' do
|
78
|
-
|
79
78
|
Dir.mktmpdir do |dir|
|
80
79
|
filename = File.join(dir, 'test.csv')
|
81
80
|
output = DocParser::Output.new(filename: filename)
|
@@ -162,7 +162,6 @@ describe DocParser::Parser do
|
|
162
162
|
mock_output2.verify.must_equal true
|
163
163
|
end
|
164
164
|
|
165
|
-
|
166
165
|
it 'should support parallel processing' do
|
167
166
|
mock_output = SimpleMock.new DocParser::NilOutput.new
|
168
167
|
mock_output.expect :close, nil
|
@@ -193,5 +192,4 @@ describe DocParser::Parser do
|
|
193
192
|
$method_id.must_equal :fork
|
194
193
|
mock_output.verify.must_equal true
|
195
194
|
end
|
196
|
-
|
197
195
|
end
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jurriaan Pruis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-05-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -81,19 +81,19 @@ dependencies:
|
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 0.0.3
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: multi_json
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - ~>
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 1.7
|
89
|
+
version: '1.7'
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - ~>
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: 1.7
|
96
|
+
version: '1.7'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: log4r
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -127,7 +127,9 @@ email:
|
|
127
127
|
- email@jurriaanpruis.nl
|
128
128
|
executables: []
|
129
129
|
extensions: []
|
130
|
-
extra_rdoc_files:
|
130
|
+
extra_rdoc_files:
|
131
|
+
- README.md
|
132
|
+
- LICENSE
|
131
133
|
files:
|
132
134
|
- .coveralls.yml
|
133
135
|
- .gitignore
|
@@ -153,6 +155,7 @@ files:
|
|
153
155
|
- lib/docparser/output/yaml_output.rb
|
154
156
|
- lib/docparser/parser.rb
|
155
157
|
- lib/docparser/version.rb
|
158
|
+
- test/.rubocop.yml
|
156
159
|
- test/lib/docparser/blackbox_test.rb
|
157
160
|
- test/lib/docparser/document_test.rb
|
158
161
|
- test/lib/docparser/logging_test.rb
|
@@ -238,6 +241,7 @@ signing_key:
|
|
238
241
|
specification_version: 4
|
239
242
|
summary: DocParser is a Ruby Gem for webscraping
|
240
243
|
test_files:
|
244
|
+
- test/.rubocop.yml
|
241
245
|
- test/lib/docparser/blackbox_test.rb
|
242
246
|
- test/lib/docparser/document_test.rb
|
243
247
|
- test/lib/docparser/logging_test.rb
|