docparser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.yardops +2 -0
- data/Gemfile +9 -0
- data/LICENSE +22 -0
- data/README.md +52 -0
- data/Rakefile +1 -0
- data/docparser.gemspec +23 -0
- data/example.rb +22 -0
- data/lib/docparser.rb +1 -0
- data/lib/docparser/document.rb +79 -0
- data/lib/docparser/output.rb +62 -0
- data/lib/docparser/output/csv_output.rb +20 -0
- data/lib/docparser/output/html_output.rb +78 -0
- data/lib/docparser/output/json_output.rb +30 -0
- data/lib/docparser/output/multi_output.rb +50 -0
- data/lib/docparser/output/screen_output.rb +35 -0
- data/lib/docparser/output/xlsx_output.rb +28 -0
- data/lib/docparser/output/yaml_output.rb +16 -0
- data/lib/docparser/parser.rb +105 -0
- data/lib/docparser/version.rb +4 -0
- metadata +93 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 829c9585c7ac42c2496a8b385f2f19d4486a5e10
|
4
|
+
data.tar.gz: 39f576740941b72e61babec808af6756adfeb1b8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 32a9dc03bb9f413641b25cb0e51c9951fa8e1a64a48f4b1c8cc8e1e3877cbaa87b1ac70738f85c20f7383a3446d5a37915ee53a09fd30099b00be553779f4f4a
|
7
|
+
data.tar.gz: b106a3fc8843a2d47f9d8c4f5ac65e39afc702fff8dcabfdfe89e0477731ac1c9576e9b795dcda53c927c8acc718846b2bb277a0e7e68654ead52cb97e612b3e
|
data/.gitignore
ADDED
data/.yardops
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Jurriaan Pruis
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# DocParser
|
2
|
+
|
3
|
+
Docs: http://rubydoc.info/github/jurriaan/docparser/
|
4
|
+
|
5
|
+
DocParser is a web scraping/screen scraping tool.
|
6
|
+
You can use it to easily scrape web sites.
|
7
|
+
|
8
|
+
## Features
|
9
|
+
|
10
|
+
- XPath and CSS support through Nokogiri
|
11
|
+
- Support for loading of URLs through open-uri
|
12
|
+
- Support for parallel processing of the documents
|
13
|
+
- 5 Output formats:
|
14
|
+
* CSV
|
15
|
+
* XLSX
|
16
|
+
* HTML
|
17
|
+
* YAML
|
18
|
+
* Screen (for debugging and development)
|
19
|
+
|
20
|
+
## Installation
|
21
|
+
|
22
|
+
Add this line to your application's Gemfile:
|
23
|
+
|
24
|
+
gem 'docparser'
|
25
|
+
|
26
|
+
And then execute:
|
27
|
+
|
28
|
+
$ bundle
|
29
|
+
|
30
|
+
Or install it yourself as:
|
31
|
+
|
32
|
+
$ gem install docparser
|
33
|
+
|
34
|
+
## Usage
|
35
|
+
|
36
|
+
See example.rb
|
37
|
+
|
38
|
+
## Todo
|
39
|
+
|
40
|
+
- Tests
|
41
|
+
|
42
|
+
## Contributing
|
43
|
+
|
44
|
+
1. Fork it
|
45
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
46
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
47
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
48
|
+
5. Create new Pull Request
|
49
|
+
|
50
|
+
## Contributors
|
51
|
+
|
52
|
+
- Jurriaan Pruis
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/docparser.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'docparser/version'
|
5
|
+
|
6
|
+
# Gem specification for DocParser: identity, packaged files and the
# development-time dependencies.
Gem::Specification.new do |gem|
  blurb = 'DocParser is a Ruby Gem for webscraping'

  gem.name        = 'docparser'
  gem.version     = DocParser::VERSION
  gem.authors     = ['Jurriaan Pruis']
  gem.email       = ['email@jurriaanpruis.nl']
  gem.description = blurb
  gem.summary     = blurb
  gem.homepage    = 'https://github.com/jurriaan/docparser'
  gem.license     = 'MIT'

  # Everything tracked by git is packaged; executables and test files are
  # derived from that list by path prefix.
  tracked = `git ls-files`.split($/)
  gem.files         = tracked
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  gem.add_development_dependency 'bundler', '~> 1.3'
  gem.add_development_dependency 'rake'
end
|
data/example.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#
|
2
|
+
|
3
|
+
# An example of parsing a popular dutch website..
|
4
|
+
# (C) 2013 Jurriaan Pruis
|
5
|
+
#
|
6
|
+
|
7
|
+
require 'docparser'
|
8
|
+
include DocParser
|
9
|
+
# Scrape the first 20 Hackaday index pages and write one table row per post
# (title, author, publication date, URL, summary) to hackaday.html.
page_urls = (1..20).map { |page| "http://hackaday.com/page/#{page}/" }

output = HTMLOutput.new(filename: 'hackaday.html')
output.header = ['Title', 'Author', 'Publication date', 'URL', 'Summary']

parser = Parser.new(files: page_urls, parallel: false, output: output)
parser.parse! do
  css('#content .post') do |entry|
    link   = entry.search('.entry-title a').first
    byline = entry.search('.post-info .author .fn a').first
    stamp  = entry.search('.post-info .date.published').first
    body   = entry.search('.entry-content').first

    add_row link.content, byline.content, stamp.content,
            link.attributes['href'], body.content.strip
  end
end
|
data/lib/docparser.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'docparser/parser'
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'set'
|
2
|
+
module DocParser
  # The Document class loads a single file (or URL), parses it with Nokogiri
  # and collects the rows produced while the user-supplied block runs.
  # @see Parser
  # @see Output
  class Document
    attr_reader :filename, :doc, :encoding, :results

    # Loads and parses +filename+.
    # @param filename [String] path or URL handed to +open+
    # @param encoding [String] encoding of the source; anything other than
    #   'utf-8' is transcoded to UTF-8 while reading
    # @param parser [Parser, nil] the owning parser; the number of its
    #   outputs decides how many result buckets are created. Without a
    #   parser a single bucket is created so that +add_row+ still works.
    def initialize(filename, encoding: 'utf-8', parser: nil)
      mode = encoding == 'utf-8' ? 'r:utf-8' : "r:#{encoding}:utf-8"

      # NOTE(review): URLs only work here when open-uri patches Kernel#open
      # (no longer the case on Ruby 3.0+) - confirm the targeted Ruby version.
      open(filename, mode) { |f| @doc = Nokogiri::HTML(f) }

      @encoding = encoding
      @parser = parser
      @filename = filename
      bucket_count = parser ? parser.outputs.length : 1
      @results = Array.new(bucket_count) { [] }
    end

    # Adds a row to an output
    # @param row [Array] the cells; nested arrays are flattened
    # @param output [Integer, Output] index of the output, or the Output
    #   instance itself (it is looked up in the parser's outputs)
    # @raise [ArgumentError] if the output is not known to the parser or the
    #   index does not refer to an existing result bucket
    def add_row(*row, output: 0)
      index = output.is_a?(Output) ? output_index(output) : output
      bucket = results.fetch(index) do
        raise ArgumentError, "No output with index #{index}"
      end
      bucket << row.flatten
    end

    # Extracts the document title
    # @return [String] the title of the document
    def title
      @title ||= xpath_content('//head/title')
    end

    # @return [String] the source of the document
    def html
      @html ||= @doc.inner_html # TODO: ??
    end

    # Executes a xpath (or CSS) query and yields every match when a block is
    # given.
    # @return the matched nodes, with or without a block
    def xpath(query, &block)
      nodes = @doc.search(query)
      nodes.each(&block) if block
      nodes
    end

    # Executes a xpath query and returns the content
    # @return [String, nil] the content of the first matching node, or nil
    #   when nothing matches
    def xpath_content(query)
      node = @doc.search(query).first
      node.content if node
    end

    # Matches the HTML source using a regular expression
    # @return [MatchData, nil] nil when nothing matches or matching fails
    def regexp(regexp)
      html.match(regexp)
    rescue StandardError
      # Deliberately best-effort: any matching failure counts as "no match".
      nil
    end

    # Parses the document by evaluating the block in the context of this
    # document.
    # @return [Array] containing the parse results
    def parse!(&block)
      instance_exec(&block)
      results
    end

    # @!visibility private
    def inspect
      "<Document file:'#{@filename}'>"
    end

    alias :css :xpath
    alias :css_content :xpath_content

    private

    # Resolves an Output instance to its position in the parser's outputs.
    def output_index(output)
      index = @parser && @parser.outputs.index(output)
      raise ArgumentError, 'Output is not registered with the parser' if index.nil?

      index
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module DocParser
  # The Output base class.
  # All Output classes inherit from this one. Subclasses customise behaviour
  # by overriding the hooks +open_file+, +header+, +write_row+ and +footer+.
  class Output
    attr_reader :rowcount

    # Creates a new output and opens the target file for writing.
    # @param filename [String] Output filename
    # @raise [ArgumentError] if no (or an empty) filename is given
    def initialize(filename: nil)
      raise ArgumentError, 'Please specify a filename' if filename.nil? || filename.empty?

      @rowcount = 0
      @filename = filename
      # File.open instead of Kernel#open: the name is always a plain path.
      @file = File.open(filename, 'w')
      open_file
    end

    # Stores the header and triggers the +header+ hook
    def header=(row)
      @header = row
      header
    end

    # Adds a row: counts it and hands it to the +write_row+ hook
    def add_row(row)
      @rowcount += 1
      write_row(row)
    end

    # Closes output and IO; the +footer+ hook runs first
    def close
      footer
      @file.close unless @file.closed?
    end

    # Called after the file is opened
    def open_file
      # do nothing
    end

    # Called after header is set
    def header
      # do nothing
    end

    # Called when a row is added
    # @raise [RuntimeError] unless a subclass overrides this hook
    def write_row(row)
      raise 'No row writer defined'
    end

    # Called before closing the file
    def footer
      # do nothing
    end

    # Displays information about the output
    # @return [String] containing number of rows and file size
    def summary
      format("%s:\t%d rows, %9.2f KiB",
             @filename, @rowcount, File.size(@filename) / 1024.0)
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'csv'
|
2
|
+
module DocParser
  # Output that writes every row to a semicolon-separated CSV file.
  # @see Output
  class CSVOutput < Output
    # @!visibility private
    # Wraps the opened file in a CSV writer using ';' as column separator.
    def open_file
      @csv = CSV.new(@file, col_sep: ';')
    end

    # Emits the stored header as an ordinary CSV line.
    def header
      write_row(@header)
    end

    # Appends a single row to the CSV file.
    def write_row(row)
      @csv.add_row(row)
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
module DocParser
  # The HTMLOutput class generates an HTML file containing a table
  # @see Output
  class HTMLOutput < Output
    # @!visibility private
    HTMLHEADER = <<-EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>HTML output "#FILENAME#"</title>
<style type="text/css">
body {
font-family:"Helvetica Neue", Helvetica, Sans-Serif;
font-size:12px;
}
table {
border:1px solid #69c;
border-collapse:collapse;
font-size:12px;
text-align:left;
width:480px;
}
th {
border-bottom:1px dashed #69c;
color:#039;
font-size:14px;
font-weight:normal;
padding:12px 17px;
}
td {
color:#669;
padding:7px 17px;
white-space: pre;
}
tbody tr:hover td {
background:#d0dafd;
color:#339;
}
tbody tr:nth-child(even) {
background:#e0eaff;
}
</style>
</head>
<body>
<table>
    EOS
    # @!visibility private
    HTMLFOOTER = <<-EOS
</tbody>
</table>
<p>#COUNT# rows</p>
</body>
</html>
    EOS

    # Writes the page preamble; the filename is escaped before it is placed
    # in the <title>. The block form of gsub keeps backslashes in the
    # filename from being read as replacement escapes.
    def open_file
      @file << HTMLHEADER.gsub('#FILENAME#') { CGI.escapeHTML(@filename) }
    end

    # Writes the table head (escaped like regular cells) and opens the body.
    def header
      cells = @header.map { |f| "<th>#{CGI.escapeHTML(f.to_s)}</th>" }.join
      @file << '<thead><tr>' << cells << '</tr></thead>' << "\n"
      open_body
    end

    # Writes one table row; every cell is HTML-escaped.
    def write_row(row)
      open_body
      @file << '<tr>'
      @file << row.map { |f| '<td>' + CGI.escapeHTML(f.to_s) + '</td>' }.join
      @file << "</tr>\n"
    end

    # Closes the table and reports the number of rows written.
    def footer
      open_body # keeps the closing </tbody> of the footer balanced
      @file << HTMLFOOTER.gsub('#COUNT#', @rowcount.to_s)
    end

    private

    # Emits the opening <tbody> exactly once, even when no header was set.
    def open_body
      return if @body_open

      @file << "<tbody>\n"
      @body_open = true
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'json'
|
2
|
+
module DocParser
  # The JSONOutput class generates a JSON file containing a single array with
  # one JSON object per row (keyed by the header fields)
  # @see Output
  class JSONOutput < Output
    # @!visibility private
    # Starts the enclosing JSON array.
    def open_file
      @file << '['
      @first = true
    end

    # Writes one row, preceded by a comma for every row but the first.
    def write_row(row)
      @file << ',' unless @first
      @first = false
      @file << JSON.dump(document_for(row))
    end

    # Terminates the JSON array. Implemented as the footer hook (instead of
    # overriding close) so that the base class still closes the file.
    def footer
      @file << ']'
    end

    private

    # Pairs header fields with row values; missing values become nil.
    # Without a header the raw row is written as a JSON array.
    def document_for(row)
      return row unless @header

      Hash[@header.zip(row)]
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module DocParser
  # The MultiOutput output combines multiple outputs.
  # It creates a CSV, HTML, YAML, XLSX and JSON Output file
  # @see CSVOutput
  # @see HTMLOutput
  # @see YAMLOutput
  # @see XLSXOutput
  # @see JSONOutput
  # @see Output
  class MultiOutput < Output
    # @!visibility private
    # @param options [Hash] options forwarded to every wrapped output;
    #   +:filename+ is used as base name and gets the format's extension
    # @raise [ArgumentError] if no (or an empty) filename is given
    def initialize(**options)
      base = options[:filename]
      raise ArgumentError, 'Please specify a filename' if base.nil? || base.empty?

      # Resolved here rather than in a constant so that the load order of the
      # output classes does not matter.
      formats = [['.csv', CSVOutput], ['.html', HTMLOutput],
                 ['.yml', YAMLOutput], ['.xlsx', XLSXOutput],
                 ['.json', JSONOutput]]
      @outputs = formats.map do |extension, klass|
        klass.new(**options.merge(filename: base + extension))
      end
    end

    # Sets the (flattened) header on every wrapped output
    def header=(row)
      @outputs.each { |out| out.header = row.flatten }
    end

    # Adds the (flattened) row to every wrapped output
    def add_row(row)
      @outputs.each { |out| out.add_row row.flatten }
    end

    # @return [Integer] the lowest row count of all wrapped outputs
    def rowcount
      @outputs.map { |out| out.rowcount }.min
    end

    # Closes every wrapped output
    def close
      @outputs.each { |out| out.close }
    end

    # @return [String] the summaries of all wrapped outputs, one per line
    def summary
      @outputs.map { |out| out.summary }.join("\n")
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'terminal-table'
|
2
|
+
require 'pageme'
|
3
|
+
module DocParser
  # This Output can be used for debugging purposes.
  #
  # It pipes all rows through a pager
  # @see Output
  class ScreenOutput < Output
    # @!visibility private

    include PageMe

    # No file is involved, so the base initializer is deliberately not called.
    def initialize
      @tables = []
      @rowcount = 0
    end

    # Shows all collected rows through the pager
    def close
      page do |p|
        p.puts "Showing all #{@tables.length} rows:\n\n"
        @tables.each do |table|
          p.puts table
        end
      end
    end

    # Renders the row as a two-column table of header field and value.
    # Without a header every value gets a line of its own.
    def write_row(row)
      out = @header ? @header.zip(row) : row.map { |value| [value] }
      @tables << Terminal::Table.new(rows: out)
    end

    # The inherited summary reports the size of the output file, which does
    # not exist for screen output; only the row count is reported.
    # @return [String] containing the number of rows
    def summary
      "Screen output:\t#{@rowcount} rows"
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'axlsx'
|
2
|
+
module DocParser
  # The XLSXOutput class generates Microsoft Excel compatible .xlsx files
  # using the great axlsx library
  # @see Output
  class XLSXOutput < Output
    # @!visibility private
    # Prepares the workbook. The file handle opened by the base class is
    # closed right away because the package is serialized by name in close.
    def open_file
      @package = Axlsx::Package.new
      @package.workbook.date1904 = false # Fix for OS X
      @sheet = @package.workbook.add_worksheet
      @file.close
    end

    # Writes the header as the first worksheet row
    def header
      write_row @header
    end

    # Appends a row to the worksheet
    def write_row(row)
      @sheet.add_row row
    end

    # Declares the written cells as a table named 'Data' and serializes the
    # workbook. The table is skipped when the sheet has no cells at all.
    def close
      last_cell = @sheet.cells.last
      @sheet.add_table("A1:#{last_cell.r}", name: 'Data') if last_cell
      @package.serialize @filename
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
module DocParser
  # The YAMLOutput class generates a YAML file containing all rows as separate
  # YAML documents
  # @see Output
  class YAMLOutput < Output
    # @!visibility private
    # Dumps one YAML document per row into the output file.
    def write_row(row)
      YAML.dump(document_for(row), @file)
    end

    private

    # Pairs header fields with row values; missing values become nil. A fresh
    # hash is built per row so that no keys leak from earlier rows.
    # Without a header the raw row is dumped as a sequence.
    def document_for(row)
      return row unless @header

      Hash[@header.zip(row)]
    end
  end
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
$:.unshift __dir__
|
2
|
+
require 'rubygems'
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'version'
|
5
|
+
require 'output'
|
6
|
+
require 'document'
|
7
|
+
require 'nokogiri'
|
8
|
+
require 'open-uri'
|
9
|
+
require 'parallel'
|
10
|
+
require 'set'
|
11
|
+
require 'output/screen_output.rb'
|
12
|
+
require 'output/csv_output.rb'
|
13
|
+
require 'output/html_output.rb'
|
14
|
+
require 'output/xlsx_output.rb'
|
15
|
+
require 'output/yaml_output.rb'
|
16
|
+
require 'output/json_output.rb'
|
17
|
+
require 'output/multi_output.rb'
|
18
|
+
# {include:file:README.md}
|
19
|
+
module DocParser
  # The main parser class. This is the class you'll use to create your parser
  # The real work happens in the Document class
  # @see Document
  class Parser
    # @!visibility private
    attr_reader :outputs

    # Creates a new parser instance
    # @param files [Array] An array containing URLs or paths to files
    # @param quiet [Boolean] Be quiet
    # @param encoding [String] The encoding to use for opening the files
    # @param parallel [Boolean] Use parallel processing
    # @param output [Output, Array] The output(s), defaults to a Screenoutput
    # @param range [Range] Range of files to process (nil means process all)
    # @param num_processes [Fixnum] Number of parallel processes
    # @raise [ArgumentError] if output is neither an Output nor an Array of
    #   Outputs
    def initialize(files: [], quiet: false, encoding: 'utf-8', parallel: true,
                   output: ScreenOutput.new, range: nil,
                   num_processes: Parallel.processor_count + 1)
      @quiet = quiet
      @parallel = parallel
      @num_processes = num_processes
      @encoding = encoding
      @outputs = normalize_outputs(output)
      # Array#[] returns nil for a range that starts past the end of the
      # list; treat that as "no files" instead of failing later on.
      @files = (range ? files[range] : files) || []
      log 'DocParser loaded..'
      log "#{@files.length} files loaded (encoding: #{@encoding})"
    end

    #
    # Parses the `files`: the block is evaluated in the context of a Document
    # for every file, after which the collected rows are written to the
    # outputs and every output is closed.
    #
    def parse!(&block)
      log "Parsing #{@files.length} files."
      start_time = Time.now
      resultsets = collect_results(&block)

      log "\nSummary\n======="
      write_results(resultsets)

      log ''
      log 'Done processing in %.2fs.' % (Time.now - start_time)
    end

    private

    # Wraps a single Output in an array and validates arrays of Outputs.
    def normalize_outputs(output)
      return [output] if output.is_a?(Output)
      return output if output.is_a?(Array) && output.all? { |o| o.is_a?(Output) }

      raise ArgumentError, 'No outputs specified'
    end

    # Parses all files and returns one Set of rows per output. Because Sets
    # are used, identical rows are stored only once.
    def collect_results(&block)
      resultsets = Array.new(@outputs.length) { Set.new }

      if @parallel && @num_processes > 1
        log "Starting #{@num_processes} processes"
        parsed = Parallel.map(@files, in_processes: @num_processes) do |file|
          parse_file(file, &block)
        end
        parsed.each { |result| merge_results(resultsets, result) }
        log 'Parallel processing finished, writing results..'
      else
        @files.each { |file| merge_results(resultsets, parse_file(file, &block)) }
      end

      resultsets
    end

    # Parses a single file and returns its rows grouped per output.
    def parse_file(file, &block)
      Document.new(file, encoding: @encoding, parser: self).parse!(&block)
    end

    # Adds the rows of one document to the per-output result sets.
    def merge_results(resultsets, result)
      result.each_with_index { |rows, index| resultsets[index].merge(rows) }
    end

    # Writes every result set to its output, closes the output and logs its
    # summary.
    def write_results(resultsets)
      @outputs.each_with_index do |output, index|
        resultsets[index].each { |row| output.add_row row }
        resultsets[index] = nil # release the rows once they are written
        output.close
        log output.summary
      end
    end

    # Prints the message unless the parser is quiet.
    def log(str)
      puts str unless @quiet
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: docparser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jurriaan Pruis
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-04-11 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: DocParser is a Ruby Gem for webscraping
|
42
|
+
email:
|
43
|
+
- email@jurriaanpruis.nl
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- .gitignore
|
49
|
+
- .yardops
|
50
|
+
- Gemfile
|
51
|
+
- LICENSE
|
52
|
+
- README.md
|
53
|
+
- Rakefile
|
54
|
+
- docparser.gemspec
|
55
|
+
- example.rb
|
56
|
+
- lib/docparser.rb
|
57
|
+
- lib/docparser/document.rb
|
58
|
+
- lib/docparser/output.rb
|
59
|
+
- lib/docparser/output/csv_output.rb
|
60
|
+
- lib/docparser/output/html_output.rb
|
61
|
+
- lib/docparser/output/json_output.rb
|
62
|
+
- lib/docparser/output/multi_output.rb
|
63
|
+
- lib/docparser/output/screen_output.rb
|
64
|
+
- lib/docparser/output/xlsx_output.rb
|
65
|
+
- lib/docparser/output/yaml_output.rb
|
66
|
+
- lib/docparser/parser.rb
|
67
|
+
- lib/docparser/version.rb
|
68
|
+
homepage: https://github.com/jurriaan/docparser
|
69
|
+
licenses:
|
70
|
+
- MIT
|
71
|
+
metadata: {}
|
72
|
+
post_install_message:
|
73
|
+
rdoc_options: []
|
74
|
+
require_paths:
|
75
|
+
- lib
|
76
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - '>='
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
requirements: []
|
87
|
+
rubyforge_project:
|
88
|
+
rubygems_version: 2.0.3
|
89
|
+
signing_key:
|
90
|
+
specification_version: 4
|
91
|
+
summary: DocParser is a Ruby Gem for webscraping
|
92
|
+
test_files: []
|
93
|
+
has_rdoc:
|