docparser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 829c9585c7ac42c2496a8b385f2f19d4486a5e10
4
+ data.tar.gz: 39f576740941b72e61babec808af6756adfeb1b8
5
+ SHA512:
6
+ metadata.gz: 32a9dc03bb9f413641b25cb0e51c9951fa8e1a64a48f4b1c8cc8e1e3877cbaa87b1ac70738f85c20f7383a3446d5a37915ee53a09fd30099b00be553779f4f4a
7
+ data.tar.gz: b106a3fc8843a2d47f9d8c4f5ac65e39afc702fff8dcabfdfe89e0477731ac1c9576e9b795dcda53c927c8acc718846b2bb277a0e7e68654ead52cb97e612b3e
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ hackaday.*
data/.yardops ADDED
@@ -0,0 +1,2 @@
1
+ README.md
2
+ LICENSE
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
+ ruby '2.0.0'
2
+
3
+ source 'https://rubygems.org'
4
+ gem 'nokogiri'
5
+ gem 'parallel'
6
+ gem 'axlsx'
7
+ gem 'terminal-table'
8
+ gem 'pageme'
9
+ gem "json"
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jurriaan Pruis
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,52 @@
1
+ # DocParser
2
+
3
+ Docs: http://rubydoc.info/github/jurriaan/docparser/
4
+
5
+ DocParser is a web scraping/screen scraping tool.
6
+ You can use it to easily scrape web sites.
7
+
8
+ ## Features
9
+
10
+ - XPath and CSS support through Nokogiri
11
+ - Support for loading of URLs through open-uri
12
+ - Support for parallel processing of the documents
13
+ - 5 Output formats:
14
+ * CSV
15
+ * XLSX
16
+ * HTML
17
+ * YAML
18
+ * Screen (for debugging and development)
19
+
20
+ ## Installation
21
+
22
+ Add this line to your application's Gemfile:
23
+
24
+ gem 'docparser'
25
+
26
+ And then execute:
27
+
28
+ $ bundle
29
+
30
+ Or install it yourself as:
31
+
32
+ $ gem install docparser
33
+
34
+ ## Usage
35
+
36
+ See example.rb
37
+
38
+ ## Todo
39
+
40
+ - Tests
41
+
42
+ ## Contributing
43
+
44
+ 1. Fork it
45
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
46
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
47
+ 4. Push to the branch (`git push origin my-new-feature`)
48
+ 5. Create new Pull Request
49
+
50
+ ## Contributors
51
+
52
+ - Jurriaan Pruis
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/docparser.gemspec ADDED
@@ -0,0 +1,23 @@
1
# coding: utf-8
# Gem specification for DocParser.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'docparser/version'

Gem::Specification.new do |spec|
  spec.name          = "docparser"
  spec.version       = DocParser::VERSION
  spec.authors       = ["Jurriaan Pruis"]
  spec.email         = ["email@jurriaanpruis.nl"]
  spec.description   = %q{DocParser is a Ruby Gem for webscraping}
  spec.summary       = %q{DocParser is a Ruby Gem for webscraping}
  spec.homepage      = "https://github.com/jurriaan/docparser"
  spec.license       = "MIT"

  # Package everything tracked by git; executables and test files are derived
  # from that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies: the libraries that the parser and the output
  # classes require at load time. Without these declarations installing the
  # gem does not pull them in and `require 'docparser'` fails.
  spec.add_dependency "nokogiri"
  spec.add_dependency "parallel"
  spec.add_dependency "axlsx"
  spec.add_dependency "terminal-table"
  spec.add_dependency "pageme"
  spec.add_dependency "json"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
data/example.rb ADDED
@@ -0,0 +1,22 @@
1
#
# Example: scrape the first 20 listing pages of hackaday.com and write one
# table row per post to hackaday.html.
# (C) 2013 Jurriaan Pruis
#

require 'docparser'
include DocParser

page_urls = (1..20).map { |page| "http://hackaday.com/page/#{page}/" }

html_output = HTMLOutput.new(filename: 'hackaday.html')
html_output.header = ['Title', 'Author', 'Publication date', 'URL', 'Summary']

scraper = Parser.new(files: page_urls, parallel: false, output: html_output)

# The block passed to parse! is evaluated once per loaded page, in the context
# of that page's Document, so `css` and `add_row` are Document methods.
scraper.parse! do
  css('#content .post') do |post|
    link = post.search('.entry-title a').first
    byline = post.search('.post-info .author .fn a').first
    date = post.search('.post-info .date.published').first
    body = post.search('.entry-content').first

    add_row link.content,
            byline.content,
            date.content,
            link.attributes['href'],
            body.content.strip
  end
end
data/lib/docparser.rb ADDED
@@ -0,0 +1 @@
1
+ require 'docparser/parser'
@@ -0,0 +1,79 @@
1
+ require 'set'
2
+ module DocParser
3
# The Document class loads and parses the files.
# @see Parser
# @see Output
class Document
  attr_reader :filename, :doc, :encoding, :results

  # Loads +filename+ and parses it as HTML.
  # @param filename [String] path (or URL) of the document to load
  # @param encoding [String] encoding of the source; anything other than
  #   'utf-8' is transcoded to UTF-8 while reading
  # @param parser [Parser, nil] the parser that owns this document; its
  #   outputs determine how many result lists are kept. Without a parser a
  #   single result list is kept.
  def initialize(filename, encoding: 'utf-8', parser: nil)
    mode = encoding == 'utf-8' ? 'r:utf-8' : "r:#{encoding}:utf-8"

    # NOTE(review): URLs only work here when open-uri has patched
    # Kernel#open (parser.rb requires 'open-uri'); confirm on newer Rubies.
    open(filename, mode) do |f|
      @doc = Nokogiri::HTML(f)
    end

    @encoding = encoding
    @parser = parser
    @filename = filename
    # One independent row list per output; fall back to a single list so a
    # Document can be used without a parser.
    output_count = parser ? parser.outputs.length : 1
    @results = Array.new(output_count) { [] }
  end

  # Adds a row to an output
  # @param row [Array] the cells of the row; nested arrays are flattened
  # @param output [Integer, Output] index of the output, or the Output
  #   instance itself as registered with the parser
  # @raise [ArgumentError] when the output is not known to this document
  # @return [Array] the rows collected so far for that output
  def add_row(*row, output: 0)
    index = output
    if output.is_a? Output
      index = @parser ? @parser.outputs.index(output) : nil
    end
    rows = results[index] unless index.nil?
    raise ArgumentError, "Unknown output: #{output.inspect}" if rows.nil?
    rows << row.flatten
  end

  # Extracts the document title
  # @return [String] the title of the document
  def title
    @title ||= xpath_content('//head/title')
  end

  # @return [String] the source of the document
  def html
    @html ||= @doc.inner_html # TODO: ??
  end

  # Executes a xpath query
  # @param query [String] the query handed to the parsed document's search
  # @yield [el] each matching element, when a block is given
  # @return the matching elements (also when no block is given)
  def xpath(query)
    res = @doc.search(query)
    res.each { |el| yield el } if block_given?
    res
  end

  # Executes a xpath query and returns the content
  # @return [String, nil] the content of the first matching HTML node, or
  #   nil when nothing matches
  def xpath_content(query)
    first = @doc.search(query).first
    first.nil? ? nil : first.content
  end

  # Matches the HTML source using a regular expression
  # @return [MatchData, nil] nil when nothing matches or matching fails
  def regexp(regexp)
    html.match(regexp)
  rescue StandardError
    # Deliberately best-effort: a failing match is reported as "no match".
    nil
  end

  # Parses the document
  # @return [Array] containing the parse results
  def parse!(&block)
    # The block runs with this document as self, so it can call xpath, css,
    # add_row etc. directly.
    instance_exec(&block)
    results
  end

  # @!visibility private
  def inspect
    "<Document file:'#{@filename}'>"
  end

  alias :css :xpath
  alias :css_content :xpath_content
end
79
+ end
@@ -0,0 +1,62 @@
1
+ module DocParser
2
# The Output base class.
# All Output classes inherit from this one.
#
# Subclasses customise behaviour through the hooks open_file, header,
# write_row and footer.
class Output
  attr_reader :rowcount

  # Creates a new output
  # @param filename [String] Output filename
  # @raise [ArgumentError] when no filename (or an empty one) is given
  def initialize(filename: nil)
    # The default used to be the circular `filename: filename`, which yields
    # nil and then failed on nil.empty? instead of raising ArgumentError.
    raise ArgumentError, 'Please specify a filename' if filename.to_s.empty?
    @rowcount = 0
    @filename = filename
    # File.open rather than Kernel#open, so the name is always a plain path.
    @file = File.open(filename, 'w')
    open_file
  end

  # Stores the header
  # @param row [Array] the column titles
  def header=(row)
    @header = row
    header
  end

  # Adds a row
  # @param row [Array] the cells of the row
  def add_row(row)
    @rowcount += 1
    write_row row
  end

  # Closes output and IO
  def close
    footer
    @file.close unless @file.closed?
  end

  # Called after the file is opened
  def open_file
    # do nothing
  end

  # Called after header is set
  def header
    # do nothing
  end

  # Called when a row is added
  # @raise [RuntimeError] always; subclasses must override this hook
  def write_row(row)
    raise 'No row writer defined'
  end

  # Called before closing the file
  def footer
    # do nothing
  end

  # Displays information about the output
  # @return [String] containing number of rows and file size
  def summary
    "%s:\t%d rows, %9.2f KiB" % [@filename,
                                 @rowcount,
                                 File.size(@filename) / 1024.0]
  end
end
62
+ end
@@ -0,0 +1,20 @@
1
+ require 'csv'
2
+ module DocParser
3
# The CSVOutput class writes every row to a CSV file, using ';' as the
# column separator. The header is written as an ordinary first row.
# @see Output
class CSVOutput < Output
  # @!visibility private
  # Wraps the already opened output file in a CSV writer.
  def open_file
    csv_options = { col_sep: ';' }
    @csv = CSV.new(@file, **csv_options)
  end

  # Emits the stored header through the regular row writer.
  def header
    write_row(@header)
  end

  # Appends one row to the CSV stream.
  def write_row(row)
    @csv.add_row(row)
  end
end
20
+ end
@@ -0,0 +1,78 @@
1
+ require 'cgi'
2
+ module DocParser
3
# The HTMLOutput class generates an HTML file containing a table
# @see Output
class HTMLOutput < Output
  # @!visibility private
  HTMLHEADER = <<-EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>HTML output "#FILENAME#"</title>
<style type="text/css">
body {
font-family:"Helvetica Neue", Helvetica, Sans-Serif;
font-size:12px;
}
table {
border:1px solid #69c;
border-collapse:collapse;
font-size:12px;
text-align:left;
width:480px;
}
th {
border-bottom:1px dashed #69c;
color:#039;
font-size:14px;
font-weight:normal;
padding:12px 17px;
}
td {
color:#669;
padding:7px 17px;
white-space: pre;
}
tbody tr:hover td {
background:#d0dafd;
color:#339;
}
tbody tr:nth-child(even) {
background:#e0eaff;
}
</style>
</head>
<body>
<table>
  EOS
  # @!visibility private
  HTMLFOOTER = <<-EOS
</tbody>
</table>
<p>#COUNT# rows</p>
</body>
</html>
  EOS

  # Writes the document preamble, with the (escaped) filename in the title.
  def open_file
    # Block form of gsub: the replacement is inserted literally, so
    # backslashes in the filename are not treated as back-references.
    @file << HTMLHEADER.gsub('#FILENAME#') { CGI.escapeHTML(@filename.to_s) }
  end

  # Writes the table head. Cells are converted to strings and escaped, the
  # same way data cells are.
  def header
    @file << '<thead><tr>'
    @file << @header.map { |f| '<th>' + CGI.escapeHTML(f.to_s) + '</th>' }.join
    @file << "</tr></thead>\n"
    open_body
  end

  # Writes one table row with escaped cells.
  def write_row(row)
    open_body
    @file << '<tr>'
    @file << row.map { |f| '<td>' + CGI.escapeHTML(f.to_s) + '</td>' }.join
    @file << "</tr>\n"
  end

  # Closes the table and reports the number of rows written.
  def footer
    # The footer template starts with </tbody>; make sure it has a partner
    # even when neither a header nor a row was written.
    open_body
    @file << HTMLFOOTER.gsub('#COUNT#') { @rowcount.to_s }
  end

  private

  # Emits the opening <tbody> tag exactly once.
  def open_body
    return if @body_open
    @file << "<tbody>\n"
    @body_open = true
  end
end
78
+ end
@@ -0,0 +1,30 @@
1
+ require 'json'
2
+ module DocParser
3
# The JSONOutput class generates a JSON file containing a single array with
# one JSON object per row, keyed by the header titles
# @see Output
class JSONOutput < Output
  # @!visibility private
  # Starts the JSON array and resets the per-row state.
  def open_file
    @file << '['
    @first = true
    @doc = {}
  end

  # Writes one row as a JSON object, preceded by a comma for every row but
  # the first.
  def write_row(row)
    @file << ',' unless @first
    @first = false
    @header.each_with_index do |key, index|
      @doc[key] = row[index] rescue ''
    end
    @file << JSON.dump(@doc)
  end

  # Terminates the JSON array. This is the footer hook rather than an
  # override of close, so that Output#close still closes (and thereby
  # flushes) the file before its size is reported.
  def footer
    @file << ']'
  end
end
30
+ end
@@ -0,0 +1,50 @@
1
+ module DocParser
2
# The MultiOutput output combines multiple outputs.
# It creates a CSV, HTML, YAML, XLSX and JSON Output file
# @see CSVOutput
# @see HTMLOutput
# @see YAMLOutput
# @see XLSXOutput
# @see JSONOutput
# @see Output
class MultiOutput < Output
  # @!visibility private
  # @param options [Hash] options forwarded to every wrapped output; the
  #   :filename entry is used as base name and gets the format's extension
  #   appended
  # @raise [ArgumentError] when no filename is given
  def initialize(**options)
    basename = options[:filename].to_s
    raise ArgumentError, 'Please specify a filename' if basename.empty?

    @outputs = output_types.map do |extension, output_class|
      output_options = options.merge(filename: basename + extension)
      # Splat explicitly: the output constructors take keyword arguments.
      output_class.new(**output_options)
    end
  end

  # Forwards the header to every wrapped output.
  def header=(row)
    @outputs.each { |out| out.header = row.flatten }
  end

  # Forwards the row to every wrapped output.
  def add_row(row)
    @outputs.each { |out| out.add_row row.flatten }
  end

  # @return [Integer] the smallest row count among the wrapped outputs
  def rowcount
    @outputs.map { |out| out.rowcount }.min
  end

  # Closes every wrapped output.
  def close
    @outputs.each { |out| out.close }
  end

  # @return [String] the summaries of all wrapped outputs, one per line
  def summary
    @outputs.map { |out| out.summary }.join("\n")
  end

  private

  # Extension/class pairs, resolved lazily so the output classes only need
  # to be loaded by the time a MultiOutput is created.
  def output_types
    [
      ['.csv', CSVOutput],
      ['.html', HTMLOutput],
      ['.yml', YAMLOutput],
      ['.xlsx', XLSXOutput],
      ['.json', JSONOutput]
    ]
  end
end
50
+ end
@@ -0,0 +1,35 @@
1
+ require 'terminal-table'
2
+ require 'pageme'
3
+ module DocParser
4
# This Output can be used for debugging purposes.
#
# It pipes all rows through a pager
# @see Output
class ScreenOutput < Output
  # @!visibility private

  include PageMe

  # No file is involved, so Output#initialize is intentionally not called.
  def initialize
    @tables = []
    @rowcount = 0
  end

  # Shows every collected table through the pager.
  def close
    page do |p|
      p.puts "Showing all #{@tables.length} rows:\n\n"
      @tables.each do |table|
        p.puts table
      end
    end
  end

  # Stores the row as a two-column table of header title and value.
  def write_row(row)
    out = []
    0.upto(@header.length - 1) do |counter|
      out << [@header[counter], row[counter]]
    end
    @tables << Terminal::Table.new(rows: out)
  end

  # Output#summary reports the size of the output file, but this output has
  # no file, so the inherited version raises. Report only the row count.
  # @return [String] containing the number of rows
  def summary
    "Screen:\t%d rows" % @rowcount
  end
end
35
+ end
@@ -0,0 +1,28 @@
1
+ require 'axlsx'
2
+ module DocParser
3
# The XLSXOutput class generates Microsoft Excel compatible .xlsx files
# using the great axlsx library
# @see Output
class XLSXOutput < Output
  # @!visibility private
  # Sets up the workbook with a single worksheet.
  def open_file
    @package = Axlsx::Package.new
    @package.workbook.date1904 = false # Fix for OS X
    @sheet = @package.workbook.add_worksheet
    # The package is written to @filename on close; the handle opened by
    # Output is not used.
    @file.close
  end

  # Writes the header as the first worksheet row.
  def header
    write_row @header
  end

  # Appends one row to the worksheet.
  def write_row(row)
    @sheet.add_row row
  end

  # Wraps the written cells in a table named 'Data' and writes the workbook.
  def close
    last_cell = @sheet.cells.last
    # Without any header or row there is no last cell to span a table to.
    @sheet.add_table("A1:#{last_cell.r}", name: 'Data') if last_cell
    @package.serialize @filename
  end
end
28
+ end
@@ -0,0 +1,16 @@
1
+ require 'yaml'
2
+ module DocParser
3
# The YAMLOutput class generates a YAML file containing all rows as separate
# YAML documents
# @see Output
class YAMLOutput < Output
  # @!visibility private
  # Maps the header titles onto the row's cells and dumps the resulting hash
  # to the output file. The hash is reused between rows.
  def write_row(row)
    @doc ||= {}
    @header.each_with_index do |title, position|
      @doc[title] = row[position] rescue ''
    end
    YAML.dump(@doc, @file)
  end
end
16
+ end
@@ -0,0 +1,105 @@
1
+ $:.unshift __dir__
2
+ require 'rubygems'
3
+ require 'bundler/setup'
4
+ require 'version'
5
+ require 'output'
6
+ require 'document'
7
+ require 'nokogiri'
8
+ require 'open-uri'
9
+ require 'parallel'
10
+ require 'set'
11
+ require 'output/screen_output.rb'
12
+ require 'output/csv_output.rb'
13
+ require 'output/html_output.rb'
14
+ require 'output/xlsx_output.rb'
15
+ require 'output/yaml_output.rb'
16
+ require 'output/json_output.rb'
17
+ require 'output/multi_output.rb'
18
+ # {include:file:README.md}
19
+ module DocParser
20
# The main parser class. This is the class you'll use to create your parser
# The real work happens in the Document class
# @see Document
class Parser
  # @!visibility private
  attr_reader :outputs

  # Creates a new parser instance
  # @param files [Array] An array containing URLs or paths to files
  # @param quiet [Boolean] Be quiet
  # @param encoding [String] The encoding to use for opening the files
  # @param parallel [Boolean] Use parallel processing
  # @param output [Output, Array] The output(s), defaults to a ScreenOutput
  # @param range [Range] Range of files to process (nil means process all)
  # @param num_processes [Fixnum] Number of parallel processes
  # @raise [ArgumentError] when output is neither an Output nor an Array
  #   consisting solely of Outputs
  def initialize(files: [], quiet: false, encoding: 'utf-8', parallel: true,
                 output: ScreenOutput.new, range: nil,
                 num_processes: Parallel.processor_count + 1)
    @quiet = quiet
    @parallel = parallel
    @num_processes = num_processes
    @encoding = encoding
    @outputs =
      if output.is_a?(Output)
        [output]
      elsif output.is_a?(Array) && output.all? { |candidate| candidate.is_a?(Output) }
        output
      else
        raise ArgumentError, 'No outputs specified'
      end
    @files = range ? files[range] : files
    log 'DocParser loaded..'
    log "#{@files.length} files loaded (encoding: #{@encoding})"
  end

  #
  # Parses the `files`
  #
  # Every file is loaded into a Document and the block is handed to
  # Document#parse!. The rows are gathered in one Set per output (so
  # identical rows are written once), written to the outputs, and each
  # output is closed and summarised.
  def parse!(&block)
    log "Parsing #{@files.length} files."
    started_at = Time.now
    resultsets = @outputs.map { Set.new }
    # Folds the per-output rows of one document into the collected sets.
    collect = lambda do |per_output|
      per_output.each_with_index do |rows, position|
        resultsets[position].merge(rows)
      end
    end

    if @parallel && @num_processes > 1
      log "Starting #{@num_processes} processes"
      parsed = Parallel.map(@files, in_processes: @num_processes) do |file|
        Document.new(file, encoding: @encoding, parser: self).parse!(&block)
      end
      parsed.each { |per_output| collect.call(per_output) }
      log 'Parallel processing finished, writing results..'
    else
      @files.each do |file|
        document = Document.new(file, encoding: @encoding, parser: self)
        collect.call(document.parse!(&block))
      end
    end

    log "\nSummary\n======="

    @outputs.each_with_index do |target, position|
      resultsets[position].each { |row| target.add_row(row) }
      # Drop the reference to the rows once they are written.
      resultsets[position] = nil
      target.close
      log target.summary
    end

    log ''
    log format('Done processing in %.2fs.', Time.now - started_at)
  end

  private

  # Prints the message unless the parser was created with quiet: true.
  def log(str)
    puts str unless @quiet
  end
end
105
+ end
@@ -0,0 +1,4 @@
1
module DocParser
  # The current version of DocParser
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
metadata ADDED
@@ -0,0 +1,93 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: docparser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jurriaan Pruis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-04-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: DocParser is a Ruby Gem for webscraping
42
+ email:
43
+ - email@jurriaanpruis.nl
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - .yardops
50
+ - Gemfile
51
+ - LICENSE
52
+ - README.md
53
+ - Rakefile
54
+ - docparser.gemspec
55
+ - example.rb
56
+ - lib/docparser.rb
57
+ - lib/docparser/document.rb
58
+ - lib/docparser/output.rb
59
+ - lib/docparser/output/csv_output.rb
60
+ - lib/docparser/output/html_output.rb
61
+ - lib/docparser/output/json_output.rb
62
+ - lib/docparser/output/multi_output.rb
63
+ - lib/docparser/output/screen_output.rb
64
+ - lib/docparser/output/xlsx_output.rb
65
+ - lib/docparser/output/yaml_output.rb
66
+ - lib/docparser/parser.rb
67
+ - lib/docparser/version.rb
68
+ homepage: https://github.com/jurriaan/docparser
69
+ licenses:
70
+ - MIT
71
+ metadata: {}
72
+ post_install_message:
73
+ rdoc_options: []
74
+ require_paths:
75
+ - lib
76
+ required_ruby_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ requirements: []
87
+ rubyforge_project:
88
+ rubygems_version: 2.0.3
89
+ signing_key:
90
+ specification_version: 4
91
+ summary: DocParser is a Ruby Gem for webscraping
92
+ test_files: []
93
+ has_rdoc: