docparser 0.1.6 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9b7bee15c24dec4a95cb7d36e724f127360eb3b1
4
- data.tar.gz: feee0df9857c9488bbd76d065f5fbdef781ec0b8
3
+ metadata.gz: 94ae9577e872555f07210deffe7c4ebc192ac617
4
+ data.tar.gz: cec0672efe1be1d5bab9d3bff313b4b5c8bc9513
5
5
  SHA512:
6
- metadata.gz: fc6523cc590e56df0e22440b40c301c513647e56fe5ca18fc98a6d849d2cd084be29e011d242db9ee9327475d49082915e7a80faaf4077fa6c4cb056b1f51184
7
- data.tar.gz: 99c44f0eabc3e58d6e463b3cac3ae0859ad448c647ccfecd73ab9e4eef9f709df18e79d15ac0ae19c511f0652a8aabca7608c610681fd63bb51bccb32e5335c7
6
+ metadata.gz: f1105cb8834ea0c03dd1e4672ae75792ef3a242b44a3a11d718ebcf444723ea7d0e42a64f23972c2c6faf1f7ddb5c7e10cb418094fad37d255c58ad3c1ebf0a2
7
+ data.tar.gz: 5afaaee3978d2ad2f50d980622540714645a02673f2ed9d3a645b415230141839dd32f31295c37a798427745a87bedb742a02c8eba11a6f379f0e6655514e58f
@@ -1,6 +1,7 @@
1
1
  language: ruby
2
2
  rvm:
3
3
  - 2.0.0
4
+ - 2.1.1
4
5
  - ruby-head
5
6
  - jruby-head
6
7
  matrix:
data/Gemfile CHANGED
@@ -1,13 +1,12 @@
1
- ruby '2.0.0'
2
1
  gemspec
3
2
 
4
3
  source 'https://rubygems.org'
5
4
 
6
5
  group :test do
7
- gem 'minitest', '~> 5.0.8'
6
+ gem 'minitest', '~> 5.3.0'
8
7
  gem 'coveralls', require: false
9
8
  gem 'rake'
10
- gem 'rubocop', '~> 0.13.1'
9
+ gem 'rubocop', '~> 0.18.1'
11
10
  gem 'simplecov', require: false
12
11
  gem 'simple_mock'
13
12
  end
data/README.md CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  DocParser is a web scraping/screen scraping tool.
7
7
 
8
- You can use it to easily scrape web sites.
8
+ You can use it to easily scrape information out of HTML documents.
9
9
 
10
10
  The gem is called [docparser](http://rubygems.org/gems/docparser).
11
11
  You can find the documentation [here](http://rubydoc.info/github/jurriaan/docparser/).
@@ -13,7 +13,6 @@ You can find the documentation [here](http://rubydoc.info/github/jurriaan/docpar
13
13
  ## Features
14
14
 
15
15
  - XPath and CSS support through Nokogiri
16
- - Support for loading of URLs throug open-uri
17
16
  - Support for parallel processing of the documents
18
17
  - 6 Output formats:
19
18
  * CSV
@@ -19,11 +19,9 @@ Gem::Specification.new do |spec|
19
19
  spec.require_paths = ['lib']
20
20
  spec.extra_rdoc_files = ['README.md', 'LICENSE']
21
21
 
22
- spec.add_runtime_dependency 'nokogiri', '~> 1.6.0'
23
- spec.add_runtime_dependency 'parallel', '~> 0.8.4'
22
+ spec.add_runtime_dependency 'nokogiri', '~> 1.6.1'
23
+ spec.add_runtime_dependency 'parallel', '~> 0.9.1'
24
24
  spec.add_runtime_dependency 'axlsx', '~> 2.0.1'
25
- spec.add_runtime_dependency 'terminal-table', '~> 1.4.5'
26
- spec.add_runtime_dependency 'pageme', '~> 0.0.3'
27
25
  spec.add_runtime_dependency 'log4r', '~> 1.1.10'
28
26
 
29
27
  spec.add_development_dependency 'yard'
@@ -4,7 +4,17 @@ module DocParser
4
4
  # @see Parser
5
5
  # @see Output
6
6
  class Document
7
- attr_reader :filename, :doc, :encoding, :results
7
+ # @return [String] the filename of the current document
8
+ attr_reader :filename
9
+
10
+ # @return [Nokogiri::HTML::Document] a reference to the Nokogiri document
11
+ attr_reader :doc
12
+
13
+ # @return [String] the encoding of the document
14
+ attr_reader :encoding
15
+
16
+ # @return [Array] the results from this document
17
+ attr_reader :results
8
18
 
9
19
  # @return [String] the source of the document
10
20
  attr_reader :html
@@ -9,7 +9,7 @@ module DocParser
9
9
  def initialize(filename: filename)
10
10
  @rowcount = 0
11
11
  @filename = filename
12
- raise ArgumentError, 'Please specify a filename' if filename.empty?
12
+ fail ArgumentError, 'Please specify a filename' if filename.empty?
13
13
  @file = open filename, 'w'
14
14
  classname = self.class.name.split('::').last
15
15
  @logger = Log4r::Logger.new("docparser::output::#{classname}")
@@ -49,7 +49,7 @@ module DocParser
49
49
 
50
50
  # Called when a row is added
51
51
  def write_row(row)
52
- raise NotImplementedError.new('No row writer defined')
52
+ fail NotImplementedError, 'No row writer defined'
53
53
  end
54
54
 
55
55
  # Called before closing the file
@@ -2,7 +2,6 @@ require 'csv'
2
2
  module DocParser
3
3
  # The CSVOutput class generates a CSV file containing all rows
4
4
  # @see Output
5
-
6
5
  class CSVOutput < Output
7
6
  # @!visibility private
8
7
  def open_file
@@ -11,7 +11,7 @@ module DocParser
11
11
  end
12
12
 
13
13
  def write_row(row)
14
- raise MissingHeaderException if @header.nil? || @header.length == 0
14
+ fail MissingHeaderException if @header.nil? || @header.length == 0
15
15
 
16
16
  @file << ',' unless @file.pos <= 1
17
17
 
@@ -6,7 +6,7 @@ module DocParser
6
6
  class YAMLOutput < Output
7
7
  # @!visibility private
8
8
  def write_row(row)
9
- raise MissingHeaderException if @header.nil? || @header.length == 0
9
+ fail MissingHeaderException if @header.nil? || @header.length == 0
10
10
  @doc ||= {}
11
11
 
12
12
  0.upto(@header.length - 1) do |counter|
@@ -1,6 +1,5 @@
1
1
  require 'rubygems'
2
2
  require 'bundler/setup'
3
- require 'open-uri'
4
3
  require 'parallel'
5
4
  require 'set'
6
5
  require 'log4r'
@@ -8,7 +7,6 @@ require 'log4r/formatter/patternformatter'
8
7
  require 'docparser/version'
9
8
  require 'docparser/output'
10
9
  require 'docparser/document'
11
- require 'docparser/output/screen_output.rb'
12
10
  require 'docparser/output/csv_output.rb'
13
11
  require 'docparser/output/html_output.rb'
14
12
  require 'docparser/output/xlsx_output.rb'
@@ -86,7 +84,7 @@ module DocParser
86
84
  elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
87
85
  @outputs = output
88
86
  elsif !output.nil?
89
- raise ArgumentError, 'Invalid outputs specified'
87
+ fail ArgumentError, 'Invalid outputs specified'
90
88
  end
91
89
 
92
90
  @resultsets = Array.new(@outputs.length) { Set.new }
@@ -95,7 +93,7 @@ module DocParser
95
93
  def parallel_process(&block)
96
94
  @logger.info "Starting #{@num_processes} processes"
97
95
  option = RUBY_ENGINE == 'ruby' ? :in_processes : :in_threads
98
- Parallel.map(@files, { option => @num_processes }) do |file|
96
+ Parallel.map(@files, option => @num_processes) do |file|
99
97
  # :nocov: #
100
98
  parse_doc(file, &block)
101
99
  # :nocov: #
@@ -129,6 +127,5 @@ module DocParser
129
127
  output.close
130
128
  end
131
129
  end
132
-
133
130
  end
134
131
  end
@@ -2,5 +2,5 @@
2
2
  # See README.md for information on using DocParser
3
3
  module DocParser
4
4
  # The current version of DocParser
5
- VERSION = '0.1.6'
5
+ VERSION = '0.2.0'
6
6
  end
@@ -27,12 +27,6 @@ describe DocParser::Document do
27
27
  doc.xpath_content('xmltest > test').must_equal('Character Data')
28
28
  end
29
29
 
30
- it 'should read remote contents' do
31
- url = 'https://gist.github.com/jurriaan/3f2750aa546e3e6719cf/raw'
32
- doc = DocParser::Document.new(filename: url, parser: @parser)
33
- doc.html.must_equal(open(url).read)
34
- end
35
-
36
30
  it 'should use the correct encoding' do
37
31
  file = File.join($SUPPORT_DIR, 'test_encoding.html')
38
32
  file2 = File.join($SUPPORT_DIR, 'test_encoding2.html')
@@ -1,7 +1,7 @@
1
1
  require_relative '../../test_helper'
2
2
  describe DocParser::Parser do
3
3
  before do
4
- SimpleCov.at_exit { }
4
+ SimpleCov.at_exit {}
5
5
  end
6
6
 
7
7
  after do
@@ -8,6 +8,7 @@ SimpleCov.start do
8
8
  # add_filter '/test/'
9
9
  end
10
10
  require 'minitest/autorun'
11
+ require 'minitest/hell'
11
12
  require 'minitest/pride'
12
13
  require 'tempfile'
13
14
  require 'tmpdir'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jurriaan Pruis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-30 00:00:00.000000000 Z
11
+ date: 2014-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 1.6.0
19
+ version: 1.6.1
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 1.6.0
26
+ version: 1.6.1
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: parallel
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 0.8.4
33
+ version: 0.9.1
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 0.8.4
40
+ version: 0.9.1
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: axlsx
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -52,34 +52,6 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: 2.0.1
55
- - !ruby/object:Gem::Dependency
56
- name: terminal-table
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - "~>"
60
- - !ruby/object:Gem::Version
61
- version: 1.4.5
62
- type: :runtime
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - "~>"
67
- - !ruby/object:Gem::Version
68
- version: 1.4.5
69
- - !ruby/object:Gem::Dependency
70
- name: pageme
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - "~>"
74
- - !ruby/object:Gem::Version
75
- version: 0.0.3
76
- type: :runtime
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - "~>"
81
- - !ruby/object:Gem::Version
82
- version: 0.0.3
83
55
  - !ruby/object:Gem::Dependency
84
56
  name: log4r
85
57
  requirement: !ruby/object:Gem::Requirement
@@ -164,7 +136,6 @@ files:
164
136
  - lib/docparser/output/json_output.rb
165
137
  - lib/docparser/output/multi_output.rb
166
138
  - lib/docparser/output/nil_output.rb
167
- - lib/docparser/output/screen_output.rb
168
139
  - lib/docparser/output/xlsx_output.rb
169
140
  - lib/docparser/output/yaml_output.rb
170
141
  - lib/docparser/parser.rb
@@ -178,7 +149,6 @@ files:
178
149
  - test/lib/docparser/output/json_output_test.rb
179
150
  - test/lib/docparser/output/multi_output_test.rb
180
151
  - test/lib/docparser/output/nil_output_test.rb
181
- - test/lib/docparser/output/screen_output_test.rb
182
152
  - test/lib/docparser/output/xlsx_output_test.rb
183
153
  - test/lib/docparser/output/yaml_output_test.rb
184
154
  - test/lib/docparser/output_test.rb
@@ -250,7 +220,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
250
220
  version: '0'
251
221
  requirements: []
252
222
  rubyforge_project:
253
- rubygems_version: 2.0.3
223
+ rubygems_version: 2.2.2
254
224
  signing_key:
255
225
  specification_version: 4
256
226
  summary: DocParser is a Ruby Gem for webscraping
@@ -264,7 +234,6 @@ test_files:
264
234
  - test/lib/docparser/output/json_output_test.rb
265
235
  - test/lib/docparser/output/multi_output_test.rb
266
236
  - test/lib/docparser/output/nil_output_test.rb
267
- - test/lib/docparser/output/screen_output_test.rb
268
237
  - test/lib/docparser/output/xlsx_output_test.rb
269
238
  - test/lib/docparser/output/yaml_output_test.rb
270
239
  - test/lib/docparser/output_test.rb
@@ -1,37 +0,0 @@
1
- require 'terminal-table'
2
- require 'pageme'
3
- module DocParser
4
- # This Output can be used for debugging purposes.
5
-
6
- # This output sends the results directly to the terminal and pipes all rows
7
- # through a pager
8
- # @see Output
9
- class ScreenOutput < Output
10
- # @!visibility private
11
-
12
- include PageMe
13
-
14
- def initialize
15
- @tables = []
16
- @rowcount = 0
17
- end
18
-
19
- def close
20
- page do |p|
21
- p.puts "Showing all #{@tables.length} rows:\n\n"
22
- @tables.each do |table|
23
- p.puts table
24
- end
25
- end
26
- end
27
-
28
- def write_row(row)
29
- raise MissingHeaderException if @header.nil? || @header.length == 0
30
- out = []
31
- 0.upto(@header.length - 1) do |counter|
32
- out << [@header[counter], row[counter]]
33
- end
34
- @tables << Terminal::Table.new(rows: out)
35
- end
36
- end
37
- end
@@ -1,54 +0,0 @@
1
- require_relative '../../../test_helper'
2
- require 'stringio'
3
- describe DocParser::ScreenOutput do
4
- before do
5
- Log4r::Logger['docparser'].level = Log4r::ERROR
6
- end
7
- after do
8
- Log4r::Logger['docparser'].level = Log4r::INFO
9
- end
10
-
11
- it 'should not create a file' do
12
- Dir.mktmpdir do |dir|
13
- filename = File.join(dir, '*')
14
- DocParser::ScreenOutput.new
15
- Dir[filename].must_be_empty
16
- end
17
- end
18
-
19
- it 'must give the correct rowcount' do
20
- output = DocParser::ScreenOutput.new
21
- output.header = 'test', 'the', 'header'
22
- output.rowcount.must_equal 0
23
- output.add_row %w(aap noot mies)
24
- output.add_row %w(aap noot mies)
25
- output.rowcount.must_equal 2
26
- end
27
-
28
- it 'must have a header' do
29
- output = DocParser::ScreenOutput.new
30
- lambda do
31
- output.add_row %w(aap noot mies)
32
- end.must_raise(DocParser::MissingHeaderException)
33
- end
34
-
35
- it 'must output the data after close' do
36
- $out = StringIO.new
37
- output = Class.new DocParser::ScreenOutput do
38
- def page(*args, &p)
39
- args << p
40
- args.compact!
41
- page_to $out, args
42
- end
43
- end.new
44
- output.header = 'test', 'the', 'header'
45
- output.add_row ['aap1' , '', 'mies']
46
- output.add_row %w(aap2 mies1)
47
- output.close
48
- out = $out.string
49
- out.must_include 'header'
50
- out.must_include 'aap1'
51
- out.must_include 'mies1'
52
- out.must_include 'mies'
53
- end
54
- end