docparser 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.yardops +2 -0
- data/Gemfile +9 -0
- data/LICENSE +22 -0
- data/README.md +52 -0
- data/Rakefile +1 -0
- data/docparser.gemspec +23 -0
- data/example.rb +22 -0
- data/lib/docparser.rb +1 -0
- data/lib/docparser/document.rb +79 -0
- data/lib/docparser/output.rb +62 -0
- data/lib/docparser/output/csv_output.rb +20 -0
- data/lib/docparser/output/html_output.rb +78 -0
- data/lib/docparser/output/json_output.rb +30 -0
- data/lib/docparser/output/multi_output.rb +50 -0
- data/lib/docparser/output/screen_output.rb +35 -0
- data/lib/docparser/output/xlsx_output.rb +28 -0
- data/lib/docparser/output/yaml_output.rb +16 -0
- data/lib/docparser/parser.rb +105 -0
- data/lib/docparser/version.rb +4 -0
- metadata +93 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 829c9585c7ac42c2496a8b385f2f19d4486a5e10
|
4
|
+
data.tar.gz: 39f576740941b72e61babec808af6756adfeb1b8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 32a9dc03bb9f413641b25cb0e51c9951fa8e1a64a48f4b1c8cc8e1e3877cbaa87b1ac70738f85c20f7383a3446d5a37915ee53a09fd30099b00be553779f4f4a
|
7
|
+
data.tar.gz: b106a3fc8843a2d47f9d8c4f5ac65e39afc702fff8dcabfdfe89e0477731ac1c9576e9b795dcda53c927c8acc718846b2bb277a0e7e68654ead52cb97e612b3e
|
data/.gitignore
ADDED
data/.yardops
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Jurriaan Pruis
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# DocParser
|
2
|
+
|
3
|
+
Docs: http://rubydoc.info/github/jurriaan/docparser/
|
4
|
+
|
5
|
+
DocParser is a web scraping/screen scraping tool.
|
6
|
+
You can use it to easily scrape web sites.
|
7
|
+
|
8
|
+
## Features
|
9
|
+
|
10
|
+
- XPath and CSS support through Nokogiri
|
11
|
+
- Support for loading of URLs through open-uri
|
12
|
+
- Support for parallel processing of the documents
|
13
|
+
- 5 Output formats:
|
14
|
+
* CSV
|
15
|
+
* XLSX
|
16
|
+
* HTML
|
17
|
+
* YAML
|
18
|
+
* Screen (for debugging and development)
|
19
|
+
|
20
|
+
## Installation
|
21
|
+
|
22
|
+
Add this line to your application's Gemfile:
|
23
|
+
|
24
|
+
gem 'docparser'
|
25
|
+
|
26
|
+
And then execute:
|
27
|
+
|
28
|
+
$ bundle
|
29
|
+
|
30
|
+
Or install it yourself as:
|
31
|
+
|
32
|
+
$ gem install docparser
|
33
|
+
|
34
|
+
## Usage
|
35
|
+
|
36
|
+
See example.rb
|
37
|
+
|
38
|
+
## Todo
|
39
|
+
|
40
|
+
- Tests
|
41
|
+
|
42
|
+
## Contributing
|
43
|
+
|
44
|
+
1. Fork it
|
45
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
46
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
47
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
48
|
+
5. Create new Pull Request
|
49
|
+
|
50
|
+
## Contributors
|
51
|
+
|
52
|
+
- Jurriaan Pruis
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/docparser.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'docparser/version'
|
5
|
+
|
6
|
+
# Gem specification for docparser: package metadata, the packaged file list
# and the gems required at runtime and during development.
Gem::Specification.new do |spec|
  spec.name          = "docparser"
  spec.version       = DocParser::VERSION
  spec.authors       = ["Jurriaan Pruis"]
  spec.email         = ["email@jurriaanpruis.nl"]
  spec.description   = %q{DocParser is a Ruby Gem for webscraping}
  spec.summary       = %q{DocParser is a Ruby Gem for webscraping}
  spec.homepage      = "https://github.com/jurriaan/docparser"
  spec.license       = "MIT"

  # The library uses keyword arguments and Kernel#__dir__, which both
  # require Ruby 2.0 or newer.
  spec.required_ruby_version = ">= 2.0.0"

  # Package every file tracked by git.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Gems that lib/docparser requires when it is loaded; without these
  # declarations a plain `gem install docparser` yields a gem that fails
  # with LoadError on `require 'docparser'`.
  spec.add_runtime_dependency "nokogiri"
  spec.add_runtime_dependency "parallel"
  spec.add_runtime_dependency "axlsx"
  spec.add_runtime_dependency "terminal-table"
  spec.add_runtime_dependency "pageme"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
|
data/example.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#
# An example of parsing a popular website (Hackaday).
# (C) 2013 Jurriaan Pruis
#

require 'docparser'
include DocParser

# Every scraped post becomes one row of an HTML table.
output = HTMLOutput.new filename: 'hackaday.html'
output.header = 'Title', 'Author', 'Publication date', 'URL', 'Summary'

# The first 20 index pages of the site.
pages = (1..20).map { |i| "http://hackaday.com/page/#{i}/" }
parser = Parser.new(files: pages, parallel: false, output: output)

# The block is evaluated once per page, in the context of that page's
# DocParser::Document, so `css` and `add_row` are Document methods.
parser.parse! do
  css('#content .post') do |post|
    title_el = post.search('.entry-title a').first
    title = title_el.content
    author = post.search('.post-info .author .fn a').first.content
    published_time = post.search('.post-info .date.published').first.content
    # Node#[] returns the attribute value as a String; `attributes['href']`
    # would store a Nokogiri attribute object in the row instead.
    url = title_el['href']
    summary = post.search('.entry-content').first.content.strip
    add_row title, author, published_time, url, summary
  end
end
|
data/lib/docparser.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'docparser/parser'
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'set'
|
2
|
+
module DocParser
|
3
|
+
# The Document class loads and parses the files.
|
4
|
+
# @see Parser
|
5
|
+
# @see Output
|
6
|
+
# Loads a single file or URL, parses it with Nokogiri and collects the rows
# that the user's parse block adds for each output of the owning parser.
class Document
  attr_reader :filename, :doc, :encoding, :results

  # Opens and parses +filename+.
  # @param filename [String] path or URL handed to +open+
  # @param encoding [String] encoding of the source; anything other than
  #   'utf-8' is transcoded to UTF-8 while reading
  # @param parser [Parser, nil] owning parser; one result list is kept per
  #   parser output (a single list when no parser is given)
  def initialize(filename, encoding: 'utf-8', parser: nil)
    mode = encoding == 'utf-8' ? 'r:utf-8' : "r:#{encoding}:utf-8"

    open(filename, mode) do |f|
      @doc = Nokogiri::HTML(f)
    end

    @encoding = encoding
    @parser = parser
    @filename = filename
    # Without a parser there is still one default result list, so that
    # add_row with the default output index keeps working.
    list_count = parser ? parser.outputs.length : 1
    @results = Array.new(list_count) { [] }
  end

  # Adds a row to an output
  # @param row [Array] the cells; nested arrays are flattened
  # @param output [Integer, Output] index of the output, or the output itself
  # @raise [ArgumentError] when an Output that is not registered with the
  #   parser is given
  def add_row(*row, output: 0)
    if output.is_a? Output
      index = @parser.outputs.index(output)
      raise ArgumentError, 'Output is not registered with the parser' if index.nil?
      output = index
    end
    results[output] << row.flatten
  end

  # Extracts the document title
  # @return [String] the title of the document
  def title
    @title ||= xpath_content('//head/title')
  end

  # @return [String] the source of the document
  def html
    @html ||= @doc.inner_html
  end

  # Executes a xpath query
  # Yields every matching element when a block is given.
  # @return the search result, so the query is also usable without a block
  def xpath(query)
    res = @doc.search(query)
    res.each { |el| yield el } if block_given?
    res
  end

  # Executes a xpath query and returns the content
  # @return [String, nil] the content of the first matching HTML node, or
  #   nil when nothing matches
  def xpath_content(query)
    node = @doc.search(query).first
    node.content unless node.nil?
  end

  # Matches the HTML source using a regular expression
  # @return [MatchData, nil] nil when there is no match or matching raises
  def regexp(regexp)
    html.match(regexp)
  rescue StandardError
    nil
  end

  # Parses the document
  # The block is evaluated in the context of this document.
  # @return [Array] containing the parse results
  def parse!(&block)
    instance_exec(&block)
    results
  end

  # @!visibility private
  def inspect
    "<Document file:'#{@filename}'>"
  end

  alias :css :xpath
  alias :css_content :xpath_content
end
|
79
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module DocParser
|
2
|
+
# The Output base class.
|
3
|
+
# All Output classes inherit from this one.
|
4
|
+
# Base class of all outputs. It opens the target file, counts the rows and
# delegates the actual writing to hook methods (open_file, header, write_row
# and footer) that subclasses override.
class Output
  attr_reader :rowcount

  # Creates a new output
  # @param filename [String] Output filename
  # @raise [ArgumentError] when the filename is missing or empty
  def initialize(filename: nil)
    @rowcount = 0
    @filename = filename
    raise ArgumentError, 'Please specify a filename' if filename.nil? || filename.empty?
    # File.open instead of Kernel#open, so a name starting with "|" is
    # never interpreted as a command to run.
    @file = File.open(filename, 'w')
    open_file
  end

  # Stores the header
  # and then invokes the header hook.
  def header=(row)
    @header = row
    header
  end

  # Adds a row
  # Increments the row count and passes the row to write_row.
  def add_row(row)
    @rowcount += 1
    write_row row
  end

  # Closes output and IO
  # The footer hook runs before the file is closed.
  def close
    footer
    @file.close unless @file.closed?
  end

  # Called after the file is opened
  def open_file
    # do nothing
  end

  # Called after header is set
  def header
    # do nothing
  end

  # Called when a row is added
  # @raise [RuntimeError] always; subclasses have to override this hook
  def write_row(row)
    raise 'No row writer defined'
  end

  # Called before closing the file
  def footer
  end

  # Displays information about the output
  # @return [String] containing number of rows and file size
  def summary
    format("%s:\t%d rows, %9.2f KiB", @filename, @rowcount,
           File.size(@filename) / 1024.0)
  end
end
|
62
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'csv'
|
2
|
+
module DocParser
|
3
|
+
# The CSVOutput class generates a CSV file containing all rows
|
4
|
+
# @see Output
|
5
|
+
|
6
|
+
# Output that writes every row as one line of a semicolon-separated CSV file.
class CSVOutput < Output
  # @!visibility private
  # Wraps the already opened output file in a CSV writer.
  def open_file
    @csv = CSV.new(@file, col_sep: ';')
  end

  # Writes the stored header as a regular CSV row.
  def header
    write_row(@header)
  end

  # Appends one row to the CSV stream.
  def write_row(row)
    @csv.add_row(row)
  end
end
|
20
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
module DocParser
|
3
|
+
# The HTMLOutput class generates an HTML file containing a table
|
4
|
+
# @see Output
|
5
|
+
# Output that renders all rows as a styled HTML table. Header cells, data
# cells and the filename in the page title are HTML-escaped.
class HTMLOutput < Output
  # @!visibility private
  # Page skeleton up to the opening table tag; #FILENAME# is substituted.
  HTMLHEADER = <<-EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>HTML output "#FILENAME#"</title>
<style type="text/css">
body {
font-family:"Helvetica Neue", Helvetica, Sans-Serif;
font-size:12px;
}
table {
border:1px solid #69c;
border-collapse:collapse;
font-size:12px;
text-align:left;
width:480px;
}
th {
border-bottom:1px dashed #69c;
color:#039;
font-size:14px;
font-weight:normal;
padding:12px 17px;
}
td {
color:#669;
padding:7px 17px;
white-space: pre;
}
tbody tr:hover td {
background:#d0dafd;
color:#339;
}
tbody tr:nth-child(even) {
background:#e0eaff;
}
</style>
</head>
<body>
<table>
  EOS
  # @!visibility private
  # Closing part of the page; #COUNT# is substituted with the row count.
  HTMLFOOTER = <<-EOS
</tbody>
</table>
<p>#COUNT# rows</p>
</body>
</html>
  EOS

  # Writes the page skeleton right after the file is opened.
  def open_file
    # Block form of gsub: the replacement is inserted literally, so
    # backslashes in the filename are not treated as back-references.
    @file << HTMLHEADER.gsub('#FILENAME#') { CGI.escapeHTML(@filename) }
  end

  # Writes the table head and opens the table body.
  def header
    @file << '<thead><tr>'
    @file << @header.map { |f| '<th>' + CGI.escapeHTML(f.to_s) + '</th>' }.join
    @file << "</tr></thead>\n"
    open_body
  end

  # Writes one escaped table row.
  def write_row(row)
    open_body
    @file << '<tr>'
    @file << row.map { |f| '<td>' + CGI.escapeHTML(f.to_s) + '</td>' }.join
    @file << "</tr>\n"
  end

  # Closes the table and reports the number of rows.
  def footer
    # The footer starts with </tbody>, so make sure its partner exists even
    # when neither a header nor a row was written.
    open_body
    @file << HTMLFOOTER.gsub('#COUNT#', @rowcount.to_s)
  end

  private

  # Emits the opening <tbody> tag exactly once.
  def open_body
    return if @body_open
    @file << "<tbody>\n"
    @body_open = true
  end
end
|
78
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'json'
|
2
|
+
module DocParser
|
3
|
+
# The JSONOutput class generates a JSON file containing all rows as separate
|
4
|
+
# JSON documents
|
5
|
+
# @see Output
|
6
|
+
# Output that writes all rows as one JSON array of objects, each object
# mapping the header names to the values of one row.
class JSONOutput < Output
  # @!visibility private
  # Starts the JSON array right after the file is opened.
  def open_file
    @file << '['
    @first = true
    @doc = {}
  end

  # Writes one row as a JSON object, comma-separated from the previous one.
  def write_row(row)
    @file << ',' unless @first
    @first = false
    @header.each_with_index do |key, index|
      # Best effort, as before: a row that cannot be indexed yields ''.
      @doc[key] = (row[index] rescue '')
    end
    @file << JSON.dump(@doc)
  end

  # Terminates the JSON array. This is done in the footer hook rather than
  # by overriding close, so that the inherited close still closes (and
  # thereby flushes) the file before its size is reported.
  def footer
    @file << ']'
  end
end
|
30
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module DocParser
|
2
|
+
# The MultiOutput output combines multiple outputs.
|
3
|
+
# It creates a CSV, HTML, YAML and XLSX Output file
|
4
|
+
# @see CSVOutput
|
5
|
+
# @see HTMLOutput
|
6
|
+
# @see YAMLOutput
|
7
|
+
# @see XLSXOutput
|
8
|
+
# @see Output
|
9
|
+
# Output that fans every header and row out to a CSV, HTML, YAML, XLSX and
# JSON output. It has no file of its own; each wrapped output writes to the
# given filename with its own extension appended.
class MultiOutput < Output
  # @!visibility private
  # Output classes in creation order, with the extension each one appends.
  FORMATS = [
    [:CSVOutput, '.csv'],
    [:HTMLOutput, '.html'],
    [:YAMLOutput, '.yml'],
    [:XLSXOutput, '.xlsx'],
    [:JSONOutput, '.json']
  ].freeze

  # @!visibility private
  # @param options [Hash] options for the wrapped outputs; :filename is the
  #   common base name
  # @raise [ArgumentError] when the filename is missing or empty
  def initialize(**options)
    basename = options[:filename]
    raise ArgumentError, 'Please specify a filename' if basename.nil? || basename.empty?

    @outputs = FORMATS.map do |class_name, extension|
      # Options are passed as keywords; a positional hash is not accepted by
      # keyword-only constructors on current Ruby versions.
      DocParser.const_get(class_name).new(**options.merge(filename: basename + extension))
    end
  end

  # Sets the (flattened) header on every wrapped output.
  def header=(row)
    @outputs.each { |out| out.header = row.flatten }
  end

  # Adds the (flattened) row to every wrapped output.
  def add_row(row)
    @outputs.each { |out| out.add_row row.flatten }
  end

  # @return [Integer] the smallest row count among the wrapped outputs
  def rowcount
    @outputs.map(&:rowcount).min
  end

  # Closes every wrapped output.
  def close
    @outputs.each(&:close)
  end

  # @return [String] the summaries of all wrapped outputs, one per line
  def summary
    @outputs.map(&:summary).join("\n")
  end
end
|
50
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'terminal-table'
|
2
|
+
require 'pageme'
|
3
|
+
module DocParser
|
4
|
+
# This Output can be used for debugging purposes.
|
5
|
+
|
6
|
+
# It pipes all rows through a pager
|
7
|
+
# @see Output
|
8
|
+
# Output that keeps one small table per row in memory and shows all of them
# through a pager when it is closed. No file is written.
class ScreenOutput < Output
  # @!visibility private

  include PageMe

  # Takes no filename: nothing is opened, rows are only collected.
  def initialize
    @tables = []
    @rowcount = 0
  end

  # Pages all collected tables.
  def close
    page do |p|
      p.puts "Showing all #{@tables.length} rows:\n\n"
      @tables.each do |table|
        p.puts table
      end
    end
  end

  # Stores the row as a two-column table of header name and value.
  def write_row(row)
    pairs = @header.each_with_index.map { |title, index| [title, row[index]] }
    @tables << Terminal::Table.new(rows: pairs)
  end

  # Displays information about the output
  # The inherited summary measures the output file, which does not exist
  # here (it would call File.size(nil)), so only the row count is reported.
  # @return [String] containing number of rows
  def summary
    format("Screen:\t%d rows", @rowcount)
  end
end
|
35
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'axlsx'
|
2
|
+
module DocParser
|
3
|
+
# The XLSXOutput class generates Microsoft Excel compatible .xlsx files
|
4
|
+
# using the great axlsx library
|
5
|
+
# @see Output
|
6
|
+
# Output that collects all rows in an Axlsx worksheet and serializes the
# workbook to the output filename on close.
class XLSXOutput < Output
  # @!visibility private
  # Sets up the workbook. The file handle opened by the base class is not
  # needed, because the package is serialized by filename in close.
  def open_file
    @package = Axlsx::Package.new
    @package.workbook.date1904 = false # Fix for OS X
    @sheet = @package.workbook.add_worksheet
    @file.close
  end

  # Writes the stored header as the first worksheet row.
  def header
    write_row @header
  end

  # Appends one row to the worksheet.
  def write_row(row)
    @sheet.add_row row
  end

  # Marks the written cells as a table named 'Data' and writes the workbook.
  def close
    last_cell = @sheet.cells.last
    # An empty sheet has no last cell, hence no range to turn into a table.
    @sheet.add_table("A1:#{last_cell.r}", name: 'Data') if last_cell
    @package.serialize @filename
  end
end
|
28
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
module DocParser
|
3
|
+
# The YAMLOutput class generates a YAML file containing all rows as separate
|
4
|
+
# YAML documents
|
5
|
+
# @see Output
|
6
|
+
# Output that dumps every row as its own YAML document, mapping the header
# names to the values of the row.
class YAMLOutput < Output
  # @!visibility private
  # Fills the reused mapping with the row's values and dumps it to the file.
  def write_row(row)
    @doc ||= {}
    @header.length.times do |position|
      # A value that cannot be read from the row is stored as ''.
      cell = begin
        row[position]
      rescue StandardError
        ''
      end
      @doc[@header[position]] = cell
    end
    YAML.dump(@doc, @file)
  end
end
|
16
|
+
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
$:.unshift __dir__
|
2
|
+
require 'rubygems'
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'version'
|
5
|
+
require 'output'
|
6
|
+
require 'document'
|
7
|
+
require 'nokogiri'
|
8
|
+
require 'open-uri'
|
9
|
+
require 'parallel'
|
10
|
+
require 'set'
|
11
|
+
require 'output/screen_output.rb'
|
12
|
+
require 'output/csv_output.rb'
|
13
|
+
require 'output/html_output.rb'
|
14
|
+
require 'output/xlsx_output.rb'
|
15
|
+
require 'output/yaml_output.rb'
|
16
|
+
require 'output/json_output.rb'
|
17
|
+
require 'output/multi_output.rb'
|
18
|
+
# {include:file:README.md}
|
19
|
+
module DocParser
|
20
|
+
# The main parser class. This is the class you'll use to create your parser
|
21
|
+
# The real work happens in the Document class
|
22
|
+
# @see Document
|
23
|
+
# The main parser class: it runs the user's parse block against every file
# (optionally in parallel processes), merges the resulting rows per output
# and finally writes and closes every output.
class Parser
  # @!visibility private
  attr_reader :outputs

  # Creates a new parser instance
  # @param files [Array] An array containing URLs or paths to files
  # @param quiet [Boolean] Be quiet
  # @param encoding [String] The encoding to use for opening the files
  # @param parallel [Boolean] Use parallel processing
  # @param output [Output, Array] The output(s), defaults to a ScreenOutput
  # @param range [Range] Range of files to process (nil means process all)
  # @param num_processes [Fixnum] Number of parallel processes
  # @raise [ArgumentError] when output is neither an Output nor an Array of
  #   Outputs
  def initialize(files: [], quiet: false, encoding: 'utf-8', parallel: true,
                 output: ScreenOutput.new, range: nil,
                 num_processes: Parallel.processor_count + 1)
    @quiet = quiet
    @parallel = parallel
    @num_processes = num_processes
    @encoding = encoding
    if output.is_a? Output
      @outputs = [output]
    elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
      @outputs = output
    else
      raise ArgumentError, 'No outputs specified'
    end
    # Array#[] returns nil for a range that starts beyond the last file;
    # treat that as "no files" instead of failing later on nil.
    @files = range ? (files[range] || []) : files
    log 'DocParser loaded..'
    log "#{@files.length} files loaded (encoding: #{@encoding})"
  end

  #
  # Parses the `files`
  #
  # The block is handed to every Document. The rows of all documents are
  # merged into one Set per output, so identical rows are written once.
  def parse!(&block)
    log "Parsing #{@files.length} files."
    start_time = Time.now
    resultsets = Array.new(@outputs.length) { Set.new }
    merge = lambda do |result|
      result.each_with_index { |rows, index| resultsets[index].merge(rows) }
    end

    if @parallel && @num_processes > 1
      log "Starting #{@num_processes} processes"
      per_file = Parallel.map(@files, in_processes: @num_processes) do |file|
        parse_file(file, &block)
      end
      per_file.each { |result| merge.call(result) }
      log 'Parallel processing finished, writing results..'
    else
      @files.each { |file| merge.call(parse_file(file, &block)) }
    end

    log "\nSummary\n======="

    @outputs.each_with_index do |output, index|
      resultsets[index].each do |row|
        output.add_row row
      end
      # Drop the reference to the rows of this output once they are written.
      resultsets[index] = nil
      output.close
      log output.summary
    end

    log ''
    log 'Done processing in %.2fs.' % (Time.now - start_time)
  end

  private

  # Loads one file and returns the rows its parse block produced, one list
  # per output.
  def parse_file(file, &block)
    Document.new(file, encoding: @encoding, parser: self).parse!(&block)
  end

  # Prints the message unless the parser was created with quiet: true.
  def log(str)
    puts str unless @quiet
  end
end
|
105
|
+
end
|
metadata
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: docparser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jurriaan Pruis
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-04-11 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: DocParser is a Ruby Gem for webscraping
|
42
|
+
email:
|
43
|
+
- email@jurriaanpruis.nl
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- .gitignore
|
49
|
+
- .yardops
|
50
|
+
- Gemfile
|
51
|
+
- LICENSE
|
52
|
+
- README.md
|
53
|
+
- Rakefile
|
54
|
+
- docparser.gemspec
|
55
|
+
- example.rb
|
56
|
+
- lib/docparser.rb
|
57
|
+
- lib/docparser/document.rb
|
58
|
+
- lib/docparser/output.rb
|
59
|
+
- lib/docparser/output/csv_output.rb
|
60
|
+
- lib/docparser/output/html_output.rb
|
61
|
+
- lib/docparser/output/json_output.rb
|
62
|
+
- lib/docparser/output/multi_output.rb
|
63
|
+
- lib/docparser/output/screen_output.rb
|
64
|
+
- lib/docparser/output/xlsx_output.rb
|
65
|
+
- lib/docparser/output/yaml_output.rb
|
66
|
+
- lib/docparser/parser.rb
|
67
|
+
- lib/docparser/version.rb
|
68
|
+
homepage: https://github.com/jurriaan/docparser
|
69
|
+
licenses:
|
70
|
+
- MIT
|
71
|
+
metadata: {}
|
72
|
+
post_install_message:
|
73
|
+
rdoc_options: []
|
74
|
+
require_paths:
|
75
|
+
- lib
|
76
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - '>='
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
requirements: []
|
87
|
+
rubyforge_project:
|
88
|
+
rubygems_version: 2.0.3
|
89
|
+
signing_key:
|
90
|
+
specification_version: 4
|
91
|
+
summary: DocParser is a Ruby Gem for webscraping
|
92
|
+
test_files: []
|
93
|
+
has_rdoc:
|