docparser 0.1.6 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9b7bee15c24dec4a95cb7d36e724f127360eb3b1
4
- data.tar.gz: feee0df9857c9488bbd76d065f5fbdef781ec0b8
3
+ metadata.gz: 94ae9577e872555f07210deffe7c4ebc192ac617
4
+ data.tar.gz: cec0672efe1be1d5bab9d3bff313b4b5c8bc9513
5
5
  SHA512:
6
- metadata.gz: fc6523cc590e56df0e22440b40c301c513647e56fe5ca18fc98a6d849d2cd084be29e011d242db9ee9327475d49082915e7a80faaf4077fa6c4cb056b1f51184
7
- data.tar.gz: 99c44f0eabc3e58d6e463b3cac3ae0859ad448c647ccfecd73ab9e4eef9f709df18e79d15ac0ae19c511f0652a8aabca7608c610681fd63bb51bccb32e5335c7
6
+ metadata.gz: f1105cb8834ea0c03dd1e4672ae75792ef3a242b44a3a11d718ebcf444723ea7d0e42a64f23972c2c6faf1f7ddb5c7e10cb418094fad37d255c58ad3c1ebf0a2
7
+ data.tar.gz: 5afaaee3978d2ad2f50d980622540714645a02673f2ed9d3a645b415230141839dd32f31295c37a798427745a87bedb742a02c8eba11a6f379f0e6655514e58f
@@ -1,6 +1,7 @@
1
1
  language: ruby
2
2
  rvm:
3
3
  - 2.0.0
4
+ - 2.1.1
4
5
  - ruby-head
5
6
  - jruby-head
6
7
  matrix:
data/Gemfile CHANGED
@@ -1,13 +1,12 @@
1
- ruby '2.0.0'
2
1
  gemspec
3
2
 
4
3
  source 'https://rubygems.org'
5
4
 
6
5
  group :test do
7
- gem 'minitest', '~> 5.0.8'
6
+ gem 'minitest', '~> 5.3.0'
8
7
  gem 'coveralls', require: false
9
8
  gem 'rake'
10
- gem 'rubocop', '~> 0.13.1'
9
+ gem 'rubocop', '~> 0.18.1'
11
10
  gem 'simplecov', require: false
12
11
  gem 'simple_mock'
13
12
  end
data/README.md CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  DocParser is a web scraping/screen scraping tool.
7
7
 
8
- You can use it to easily scrape web sites.
8
+ You can use it to easily scrape information out of HTML documents.
9
9
 
10
10
  The gem is called [docparser](http://rubygems.org/gems/docparser).
11
11
  You can find the documentation [here](http://rubydoc.info/github/jurriaan/docparser/).
@@ -13,7 +13,6 @@ You can find the documentation [here](http://rubydoc.info/github/jurriaan/docpar
13
13
  ## Features
14
14
 
15
15
  - XPath and CSS support through Nokogiri
16
- - Support for loading of URLs throug open-uri
17
16
  - Support for parallel processing of the documents
18
17
  - 6 Output formats:
19
18
  * CSV
@@ -19,11 +19,9 @@ Gem::Specification.new do |spec|
19
19
  spec.require_paths = ['lib']
20
20
  spec.extra_rdoc_files = ['README.md', 'LICENSE']
21
21
 
22
- spec.add_runtime_dependency 'nokogiri', '~> 1.6.0'
23
- spec.add_runtime_dependency 'parallel', '~> 0.8.4'
22
+ spec.add_runtime_dependency 'nokogiri', '~> 1.6.1'
23
+ spec.add_runtime_dependency 'parallel', '~> 0.9.1'
24
24
  spec.add_runtime_dependency 'axlsx', '~> 2.0.1'
25
- spec.add_runtime_dependency 'terminal-table', '~> 1.4.5'
26
- spec.add_runtime_dependency 'pageme', '~> 0.0.3'
27
25
  spec.add_runtime_dependency 'log4r', '~> 1.1.10'
28
26
 
29
27
  spec.add_development_dependency 'yard'
@@ -4,7 +4,17 @@ module DocParser
4
4
  # @see Parser
5
5
  # @see Output
6
6
  class Document
7
- attr_reader :filename, :doc, :encoding, :results
7
+ # @return [String] the filename of the current document
8
+ attr_reader :filename
9
+
10
+ # @return [Nokogiri::HTML::Document] a reference to the Nokogiri document
11
+ attr_reader :doc
12
+
13
+ # @return [String] the encoding of the document
14
+ attr_reader :encoding
15
+
16
+ # @return [Array] the results from this document
17
+ attr_reader :results
8
18
 
9
19
  # @return [String] the source of the document
10
20
  attr_reader :html
@@ -9,7 +9,7 @@ module DocParser
9
9
  def initialize(filename: filename)
10
10
  @rowcount = 0
11
11
  @filename = filename
12
- raise ArgumentError, 'Please specify a filename' if filename.empty?
12
+ fail ArgumentError, 'Please specify a filename' if filename.empty?
13
13
  @file = open filename, 'w'
14
14
  classname = self.class.name.split('::').last
15
15
  @logger = Log4r::Logger.new("docparser::output::#{classname}")
@@ -49,7 +49,7 @@ module DocParser
49
49
 
50
50
  # Called when a row is added
51
51
  def write_row(row)
52
- raise NotImplementedError.new('No row writer defined')
52
+ fail NotImplementedError, 'No row writer defined'
53
53
  end
54
54
 
55
55
  # Called before closing the file
@@ -2,7 +2,6 @@ require 'csv'
2
2
  module DocParser
3
3
  # The CSVOutput class generates a CSV file containing all rows
4
4
  # @see Output
5
-
6
5
  class CSVOutput < Output
7
6
  # @!visibility private
8
7
  def open_file
@@ -11,7 +11,7 @@ module DocParser
11
11
  end
12
12
 
13
13
  def write_row(row)
14
- raise MissingHeaderException if @header.nil? || @header.length == 0
14
+ fail MissingHeaderException if @header.nil? || @header.length == 0
15
15
 
16
16
  @file << ',' unless @file.pos <= 1
17
17
 
@@ -6,7 +6,7 @@ module DocParser
6
6
  class YAMLOutput < Output
7
7
  # @!visibility private
8
8
  def write_row(row)
9
- raise MissingHeaderException if @header.nil? || @header.length == 0
9
+ fail MissingHeaderException if @header.nil? || @header.length == 0
10
10
  @doc ||= {}
11
11
 
12
12
  0.upto(@header.length - 1) do |counter|
@@ -1,6 +1,5 @@
1
1
  require 'rubygems'
2
2
  require 'bundler/setup'
3
- require 'open-uri'
4
3
  require 'parallel'
5
4
  require 'set'
6
5
  require 'log4r'
@@ -8,7 +7,6 @@ require 'log4r/formatter/patternformatter'
8
7
  require 'docparser/version'
9
8
  require 'docparser/output'
10
9
  require 'docparser/document'
11
- require 'docparser/output/screen_output.rb'
12
10
  require 'docparser/output/csv_output.rb'
13
11
  require 'docparser/output/html_output.rb'
14
12
  require 'docparser/output/xlsx_output.rb'
@@ -86,7 +84,7 @@ module DocParser
86
84
  elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
87
85
  @outputs = output
88
86
  elsif !output.nil?
89
- raise ArgumentError, 'Invalid outputs specified'
87
+ fail ArgumentError, 'Invalid outputs specified'
90
88
  end
91
89
 
92
90
  @resultsets = Array.new(@outputs.length) { Set.new }
@@ -95,7 +93,7 @@ module DocParser
95
93
  def parallel_process(&block)
96
94
  @logger.info "Starting #{@num_processes} processes"
97
95
  option = RUBY_ENGINE == 'ruby' ? :in_processes : :in_threads
98
- Parallel.map(@files, { option => @num_processes }) do |file|
96
+ Parallel.map(@files, option => @num_processes) do |file|
99
97
  # :nocov: #
100
98
  parse_doc(file, &block)
101
99
  # :nocov: #
@@ -129,6 +127,5 @@ module DocParser
129
127
  output.close
130
128
  end
131
129
  end
132
-
133
130
  end
134
131
  end
@@ -2,5 +2,5 @@
2
2
  # See README.md for information on using DocParser
3
3
  module DocParser
4
4
  # The current version of DocParser
5
- VERSION = '0.1.6'
5
+ VERSION = '0.2.0'
6
6
  end
@@ -27,12 +27,6 @@ describe DocParser::Document do
27
27
  doc.xpath_content('xmltest > test').must_equal('Character Data')
28
28
  end
29
29
 
30
- it 'should read remote contents' do
31
- url = 'https://gist.github.com/jurriaan/3f2750aa546e3e6719cf/raw'
32
- doc = DocParser::Document.new(filename: url, parser: @parser)
33
- doc.html.must_equal(open(url).read)
34
- end
35
-
36
30
  it 'should use the correct encoding' do
37
31
  file = File.join($SUPPORT_DIR, 'test_encoding.html')
38
32
  file2 = File.join($SUPPORT_DIR, 'test_encoding2.html')
@@ -1,7 +1,7 @@
1
1
  require_relative '../../test_helper'
2
2
  describe DocParser::Parser do
3
3
  before do
4
- SimpleCov.at_exit { }
4
+ SimpleCov.at_exit {}
5
5
  end
6
6
 
7
7
  after do
@@ -8,6 +8,7 @@ SimpleCov.start do
8
8
  # add_filter '/test/'
9
9
  end
10
10
  require 'minitest/autorun'
11
+ require 'minitest/hell'
11
12
  require 'minitest/pride'
12
13
  require 'tempfile'
13
14
  require 'tmpdir'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jurriaan Pruis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-30 00:00:00.000000000 Z
11
+ date: 2014-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 1.6.0
19
+ version: 1.6.1
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 1.6.0
26
+ version: 1.6.1
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: parallel
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 0.8.4
33
+ version: 0.9.1
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 0.8.4
40
+ version: 0.9.1
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: axlsx
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -52,34 +52,6 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: 2.0.1
55
- - !ruby/object:Gem::Dependency
56
- name: terminal-table
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - "~>"
60
- - !ruby/object:Gem::Version
61
- version: 1.4.5
62
- type: :runtime
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - "~>"
67
- - !ruby/object:Gem::Version
68
- version: 1.4.5
69
- - !ruby/object:Gem::Dependency
70
- name: pageme
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - "~>"
74
- - !ruby/object:Gem::Version
75
- version: 0.0.3
76
- type: :runtime
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - "~>"
81
- - !ruby/object:Gem::Version
82
- version: 0.0.3
83
55
  - !ruby/object:Gem::Dependency
84
56
  name: log4r
85
57
  requirement: !ruby/object:Gem::Requirement
@@ -164,7 +136,6 @@ files:
164
136
  - lib/docparser/output/json_output.rb
165
137
  - lib/docparser/output/multi_output.rb
166
138
  - lib/docparser/output/nil_output.rb
167
- - lib/docparser/output/screen_output.rb
168
139
  - lib/docparser/output/xlsx_output.rb
169
140
  - lib/docparser/output/yaml_output.rb
170
141
  - lib/docparser/parser.rb
@@ -178,7 +149,6 @@ files:
178
149
  - test/lib/docparser/output/json_output_test.rb
179
150
  - test/lib/docparser/output/multi_output_test.rb
180
151
  - test/lib/docparser/output/nil_output_test.rb
181
- - test/lib/docparser/output/screen_output_test.rb
182
152
  - test/lib/docparser/output/xlsx_output_test.rb
183
153
  - test/lib/docparser/output/yaml_output_test.rb
184
154
  - test/lib/docparser/output_test.rb
@@ -250,7 +220,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
250
220
  version: '0'
251
221
  requirements: []
252
222
  rubyforge_project:
253
- rubygems_version: 2.0.3
223
+ rubygems_version: 2.2.2
254
224
  signing_key:
255
225
  specification_version: 4
256
226
  summary: DocParser is a Ruby Gem for webscraping
@@ -264,7 +234,6 @@ test_files:
264
234
  - test/lib/docparser/output/json_output_test.rb
265
235
  - test/lib/docparser/output/multi_output_test.rb
266
236
  - test/lib/docparser/output/nil_output_test.rb
267
- - test/lib/docparser/output/screen_output_test.rb
268
237
  - test/lib/docparser/output/xlsx_output_test.rb
269
238
  - test/lib/docparser/output/yaml_output_test.rb
270
239
  - test/lib/docparser/output_test.rb
@@ -1,37 +0,0 @@
1
- require 'terminal-table'
2
- require 'pageme'
3
- module DocParser
4
- # This Output can be used for debugging purposes.
5
-
6
- # This output sends the results directly to the terminal and pipes all rows
7
- # through a pager
8
- # @see Output
9
- class ScreenOutput < Output
10
- # @!visibility private
11
-
12
- include PageMe
13
-
14
- def initialize
15
- @tables = []
16
- @rowcount = 0
17
- end
18
-
19
- def close
20
- page do |p|
21
- p.puts "Showing all #{@tables.length} rows:\n\n"
22
- @tables.each do |table|
23
- p.puts table
24
- end
25
- end
26
- end
27
-
28
- def write_row(row)
29
- raise MissingHeaderException if @header.nil? || @header.length == 0
30
- out = []
31
- 0.upto(@header.length - 1) do |counter|
32
- out << [@header[counter], row[counter]]
33
- end
34
- @tables << Terminal::Table.new(rows: out)
35
- end
36
- end
37
- end
@@ -1,54 +0,0 @@
1
- require_relative '../../../test_helper'
2
- require 'stringio'
3
- describe DocParser::ScreenOutput do
4
- before do
5
- Log4r::Logger['docparser'].level = Log4r::ERROR
6
- end
7
- after do
8
- Log4r::Logger['docparser'].level = Log4r::INFO
9
- end
10
-
11
- it 'should not create a file' do
12
- Dir.mktmpdir do |dir|
13
- filename = File.join(dir, '*')
14
- DocParser::ScreenOutput.new
15
- Dir[filename].must_be_empty
16
- end
17
- end
18
-
19
- it 'must give the correct rowcount' do
20
- output = DocParser::ScreenOutput.new
21
- output.header = 'test', 'the', 'header'
22
- output.rowcount.must_equal 0
23
- output.add_row %w(aap noot mies)
24
- output.add_row %w(aap noot mies)
25
- output.rowcount.must_equal 2
26
- end
27
-
28
- it 'must have a header' do
29
- output = DocParser::ScreenOutput.new
30
- lambda do
31
- output.add_row %w(aap noot mies)
32
- end.must_raise(DocParser::MissingHeaderException)
33
- end
34
-
35
- it 'must output the data after close' do
36
- $out = StringIO.new
37
- output = Class.new DocParser::ScreenOutput do
38
- def page(*args, &p)
39
- args << p
40
- args.compact!
41
- page_to $out, args
42
- end
43
- end.new
44
- output.header = 'test', 'the', 'header'
45
- output.add_row ['aap1' , '', 'mies']
46
- output.add_row %w(aap2 mies1)
47
- output.close
48
- out = $out.string
49
- out.must_include 'header'
50
- out.must_include 'aap1'
51
- out.must_include 'mies1'
52
- out.must_include 'mies'
53
- end
54
- end