docparser 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rubocop.yml +5 -0
- data/.travis.yml +3 -0
- data/Gemfile +9 -1
- data/README.md +11 -4
- data/Rakefile +15 -0
- data/example.rb +9 -7
- data/lib/docparser.rb +1 -0
- data/lib/docparser/document.rb +18 -11
- data/lib/docparser/output.rb +8 -8
- data/lib/docparser/output/html_output.rb +53 -47
- data/lib/docparser/output/json_output.rb +8 -3
- data/lib/docparser/output/multi_output.rb +4 -8
- data/lib/docparser/output/nil_output.rb +21 -0
- data/lib/docparser/output/screen_output.rb +2 -1
- data/lib/docparser/output/xlsx_output.rb +12 -2
- data/lib/docparser/output/yaml_output.rb +6 -1
- data/lib/docparser/parser.rb +80 -49
- data/lib/docparser/version.rb +1 -1
- data/test/lib/docparser/blackbox_test.rb +29 -0
- data/test/lib/docparser/document_test.rb +134 -0
- data/test/lib/docparser/logging_test.rb +19 -0
- data/test/lib/docparser/output/csv_output_test.rb +51 -0
- data/test/lib/docparser/output/html_output_test.rb +57 -0
- data/test/lib/docparser/output/json_output_test.rb +65 -0
- data/test/lib/docparser/output/multi_output_test.rb +80 -0
- data/test/lib/docparser/output/nil_output_test.rb +27 -0
- data/test/lib/docparser/output/screen_output_test.rb +55 -0
- data/test/lib/docparser/output/xlsx_output_test.rb +53 -0
- data/test/lib/docparser/output/yaml_output_test.rb +76 -0
- data/test/lib/docparser/output_test.rb +85 -0
- data/test/lib/docparser/parser_test.rb +197 -0
- data/test/lib/docparser/version_test.rb +11 -0
- data/test/support/hackaday/dl.rb +4 -0
- data/test/support/hackaday/file_1.html +716 -0
- data/test/support/hackaday/file_10.html +791 -0
- data/test/support/hackaday/file_11.html +787 -0
- data/test/support/hackaday/file_12.html +715 -0
- data/test/support/hackaday/file_13.html +793 -0
- data/test/support/hackaday/file_14.html +718 -0
- data/test/support/hackaday/file_15.html +707 -0
- data/test/support/hackaday/file_16.html +713 -0
- data/test/support/hackaday/file_17.html +715 -0
- data/test/support/hackaday/file_18.html +725 -0
- data/test/support/hackaday/file_19.html +715 -0
- data/test/support/hackaday/file_2.html +793 -0
- data/test/support/hackaday/file_20.html +795 -0
- data/test/support/hackaday/file_21.html +804 -0
- data/test/support/hackaday/file_22.html +722 -0
- data/test/support/hackaday/file_23.html +793 -0
- data/test/support/hackaday/file_24.html +717 -0
- data/test/support/hackaday/file_25.html +715 -0
- data/test/support/hackaday/file_26.html +717 -0
- data/test/support/hackaday/file_27.html +723 -0
- data/test/support/hackaday/file_28.html +711 -0
- data/test/support/hackaday/file_29.html +711 -0
- data/test/support/hackaday/file_3.html +794 -0
- data/test/support/hackaday/file_30.html +715 -0
- data/test/support/hackaday/file_31.html +713 -0
- data/test/support/hackaday/file_32.html +714 -0
- data/test/support/hackaday/file_33.html +716 -0
- data/test/support/hackaday/file_34.html +714 -0
- data/test/support/hackaday/file_35.html +792 -0
- data/test/support/hackaday/file_36.html +719 -0
- data/test/support/hackaday/file_37.html +712 -0
- data/test/support/hackaday/file_38.html +709 -0
- data/test/support/hackaday/file_39.html +808 -0
- data/test/support/hackaday/file_4.html +814 -0
- data/test/support/hackaday/file_40.html +801 -0
- data/test/support/hackaday/file_5.html +715 -0
- data/test/support/hackaday/file_6.html +792 -0
- data/test/support/hackaday/file_7.html +714 -0
- data/test/support/hackaday/file_8.html +717 -0
- data/test/support/hackaday/file_9.html +719 -0
- data/test/support/test_encoding.html +12 -0
- data/test/support/test_encoding2.html +12 -0
- data/test/support/test_html.html +16 -0
- data/test/support/test_xml.xml +5 -0
- data/test/test_helper.rb +14 -0
- metadata +126 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ba58a4708d78ae22fc79694754ddb70cc4fee63
|
4
|
+
data.tar.gz: cef89d6934e560633e8c3b05c8bb75e16e2c424b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e0db436a8578ca5d286c6a4946ea57a0c3ad38ed9d2db27803143fc3062c229c60dfe16ac156690d9e33b5f9041aa3bbff08b1d55f87f01467f18f97ef521d64
|
7
|
+
data.tar.gz: afca96d6dd7357fe08899d793e40ea6473e7bd9707318f93848cee8cd95c98d3ff57f6f9f8543a8b08174ceddf34d39aab83135fc242d89441d351c453bf7758
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
data/.travis.yml
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,21 +1,27 @@
|
|
1
1
|
# DocParser
|
2
2
|
|
3
|
-
|
3
|
+
[![Build Status](https://travis-ci.org/jurriaan/docparser.png?branch=master)](https://travis-ci.org/jurriaan/docparser)
|
4
4
|
|
5
5
|
DocParser is a web scraping/screen scraping tool.
|
6
|
+
|
6
7
|
You can use it to easily scrape web sites.
|
7
8
|
|
9
|
+
The gem is called [docparser](http://rubygems.org/gems/docparser).
|
10
|
+
You can find the documentation [here](http://rubydoc.info/github/jurriaan/docparser/).
|
11
|
+
|
8
12
|
## Features
|
9
13
|
|
10
14
|
- XPath and CSS support through Nokogiri
|
11
15
|
- Support for loading of URLs through open-uri
|
12
16
|
- Support for parallel processing of the documents
|
13
|
-
-
|
17
|
+
- 6 Output formats:
|
14
18
|
* CSV
|
15
19
|
* XLSX
|
16
20
|
* HTML
|
17
21
|
* YAML
|
22
|
+
* JSON
|
18
23
|
* Screen (for debugging and development)
|
24
|
+
* And more! (easy to extend)
|
19
25
|
|
20
26
|
## Installation
|
21
27
|
|
@@ -33,11 +39,12 @@ Or install it yourself as:
|
|
33
39
|
|
34
40
|
## Usage
|
35
41
|
|
36
|
-
See example.rb
|
42
|
+
See [example.rb](https://github.com/jurriaan/docparser/blob/master/example.rb)
|
37
43
|
|
38
44
|
## Todo
|
39
45
|
|
40
46
|
- Tests
|
47
|
+
- Better examples
|
41
48
|
|
42
49
|
## Contributing
|
43
50
|
|
@@ -49,4 +56,4 @@ See example.rb
|
|
49
56
|
|
50
57
|
## Contributors
|
51
58
|
|
52
|
-
- Jurriaan Pruis
|
59
|
+
- [Jurriaan Pruis](https://github.com/jurriaan)
|
data/Rakefile
CHANGED
@@ -1 +1,16 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
|
+
require 'rake/testtask'
|
3
|
+
|
4
|
+
Rake::TestTask.new do |t|
|
5
|
+
t.libs << 'lib/docparser'
|
6
|
+
t.test_files = FileList['test/lib/**/*_test.rb']
|
7
|
+
t.verbose = true
|
8
|
+
end
|
9
|
+
|
10
|
+
task test: :rubocop
|
11
|
+
|
12
|
+
task :rubocop do
|
13
|
+
sh 'rubocop'
|
14
|
+
end
|
15
|
+
|
16
|
+
task :default => :test
|
data/example.rb
CHANGED
@@ -1,21 +1,23 @@
|
|
1
1
|
#
|
2
|
-
|
3
|
-
# An example of parsing a popular dutch website..
|
2
|
+
# An example of parsing hackaday.com
|
4
3
|
# (C) 2013 Jurriaan Pruis
|
5
4
|
#
|
5
|
+
$LOAD_PATH.unshift __dir__
|
6
|
+
require File.expand_path('lib/docparser.rb', __dir__)
|
7
|
+
require 'tmpdir'
|
6
8
|
|
7
|
-
require 'docparser'
|
8
9
|
include DocParser
|
9
|
-
output =
|
10
|
+
output = MultiOutput.new(filename: 'hackaday')
|
10
11
|
output.header = 'Title', 'Author', 'Publication date', 'URL', 'Summary'
|
11
|
-
|
12
|
+
files = Dir[File.join(__dir__, 'test/support/hackaday/*.html')]
|
13
|
+
parser = Parser.new(files: files, parallel: false, output: output)
|
12
14
|
parser.parse! do
|
13
15
|
css('#content .post') do |post|
|
14
16
|
title_el = post.search('.entry-title a').first
|
15
17
|
title = title_el.content
|
16
|
-
author =post.search('.post-info .author .fn a').first.content
|
18
|
+
author = post.search('.post-info .author .fn a').first.content
|
17
19
|
published_time = post.search('.post-info .date.published').first.content
|
18
|
-
url = title_el.attributes['href']
|
20
|
+
url = title_el.attributes['href'].value
|
19
21
|
summary = post.search('.entry-content').first.content.strip
|
20
22
|
add_row title, author, published_time, url, summary
|
21
23
|
end
|
data/lib/docparser.rb
CHANGED
data/lib/docparser/document.rb
CHANGED
@@ -5,26 +5,29 @@ module DocParser
|
|
5
5
|
# @see Output
|
6
6
|
class Document
|
7
7
|
attr_reader :filename, :doc, :encoding, :results
|
8
|
-
def initialize(filename, encoding: 'utf-8', parser: nil)
|
8
|
+
def initialize(filename: nil, encoding: 'utf-8', parser: nil)
|
9
9
|
if encoding == 'utf-8'
|
10
10
|
encodingstring = 'r:utf-8'
|
11
11
|
else
|
12
12
|
encodingstring = "r:#{encoding}:utf-8"
|
13
13
|
end
|
14
|
-
|
14
|
+
@logger = Log4r::Logger.new('docparser::document')
|
15
|
+
@logger.debug { "Parsing #{filename}" }
|
15
16
|
open(filename, encodingstring) do |f|
|
16
|
-
@
|
17
|
+
@html = f.read
|
18
|
+
@logger.warn "#{filename} is empty" if @html.empty?
|
19
|
+
@doc = Nokogiri(@html)
|
17
20
|
end
|
18
|
-
|
19
21
|
@encoding = encoding
|
20
22
|
@parser = parser
|
21
23
|
@filename = filename
|
22
|
-
@results = Array.new(@parser.outputs.length) { [] }
|
24
|
+
@results = Array.new(@parser.outputs ? @parser.outputs.length : 0) { [] }
|
23
25
|
end
|
24
26
|
|
25
27
|
# Adds a row to an output
|
26
28
|
def add_row(*row, output: 0)
|
27
29
|
output = @parser.outputs.index(output) if output.is_a? Output
|
30
|
+
@logger.debug { "#{filename}: Adding row #{row.flatten.to_s}" }
|
28
31
|
results[output] << row.flatten
|
29
32
|
end
|
30
33
|
|
@@ -36,13 +39,17 @@ module DocParser
|
|
36
39
|
|
37
40
|
# @return [String] the source of the document
|
38
41
|
def html
|
39
|
-
@html
|
42
|
+
@html
|
40
43
|
end
|
41
44
|
|
42
45
|
# Executes a xpath query
|
43
46
|
def xpath(query)
|
44
47
|
res = @doc.search(query)
|
45
|
-
|
48
|
+
if block_given?
|
49
|
+
res.each { |el| yield el }
|
50
|
+
else
|
51
|
+
res
|
52
|
+
end
|
46
53
|
end
|
47
54
|
|
48
55
|
# Executes a xpath query and returns the content
|
@@ -58,7 +65,7 @@ module DocParser
|
|
58
65
|
|
59
66
|
# Matches the HTML source using a regular expression
|
60
67
|
def regexp(regexp)
|
61
|
-
html.match(regexp)
|
68
|
+
html.match(regexp)
|
62
69
|
end
|
63
70
|
|
64
71
|
# Parses the document
|
@@ -70,10 +77,10 @@ module DocParser
|
|
70
77
|
|
71
78
|
# @!visibility private
|
72
79
|
def inspect
|
73
|
-
"<Document file:'#{@filename}'>"
|
80
|
+
"<Document file:'#{@filename}', encoding:'#{@encoding}'>"
|
74
81
|
end
|
75
82
|
|
76
|
-
|
77
|
-
|
83
|
+
alias_method :css, :xpath
|
84
|
+
alias_method :css_content, :xpath_content
|
78
85
|
end
|
79
86
|
end
|
data/lib/docparser/output.rb
CHANGED
@@ -11,6 +11,8 @@ module DocParser
|
|
11
11
|
@filename = filename
|
12
12
|
raise ArgumentError, 'Please specify a filename' if filename.empty?
|
13
13
|
@file = open filename, 'w'
|
14
|
+
classname = self.class.name.split('::').last
|
15
|
+
@logger = Log4r::Logger.new("docparser::output::#{classname}")
|
14
16
|
open_file
|
15
17
|
end
|
16
18
|
|
@@ -30,6 +32,9 @@ module DocParser
|
|
30
32
|
def close
|
31
33
|
footer
|
32
34
|
@file.close unless @file.closed?
|
35
|
+
@logger.info "Finished writing"
|
36
|
+
size = File.size(@filename) / 1024.0
|
37
|
+
@logger.info sprintf("%s: %d rows, %.2f KiB", @filename, rowcount, size)
|
33
38
|
end
|
34
39
|
|
35
40
|
# Called after the file is opened
|
@@ -44,19 +49,14 @@ module DocParser
|
|
44
49
|
|
45
50
|
# Called when a row is added
|
46
51
|
def write_row(row)
|
47
|
-
raise 'No row writer defined'
|
52
|
+
raise NotImplementedError.new('No row writer defined')
|
48
53
|
end
|
49
54
|
|
50
55
|
# Called before closing the file
|
51
56
|
def footer
|
52
57
|
end
|
58
|
+
end
|
53
59
|
|
54
|
-
|
55
|
-
# @return [String] containing number of rows and file size
|
56
|
-
def summary
|
57
|
-
"%s:\t%d rows, %9.2f KiB" % [@filename,
|
58
|
-
@rowcount,
|
59
|
-
File.size(@filename) / 1024.0]
|
60
|
-
end
|
60
|
+
class MissingHeaderException < StandardError
|
61
61
|
end
|
62
62
|
end
|
@@ -5,67 +5,73 @@ module DocParser
|
|
5
5
|
class HTMLOutput < Output
|
6
6
|
# @!visibility private
|
7
7
|
HTMLHEADER = <<-EOS
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
font-size:12px;
|
18
|
-
}
|
19
|
-
table {
|
20
|
-
border:1px solid #69c;
|
21
|
-
border-collapse:collapse;
|
8
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
9
|
+
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
10
|
+
<html>
|
11
|
+
<head>
|
12
|
+
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
13
|
+
<title>HTML output "#FILENAME#"</title>
|
14
|
+
<style type="text/css">
|
15
|
+
body {
|
16
|
+
font-family:"Helvetica Neue", Helvetica, Sans-Serif;
|
22
17
|
font-size:12px;
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
18
|
+
}
|
19
|
+
table {
|
20
|
+
border:1px solid #69c;
|
21
|
+
border-collapse:collapse;
|
22
|
+
font-size:12px;
|
23
|
+
text-align:left;
|
24
|
+
width:480px;
|
25
|
+
}
|
26
|
+
th {
|
27
|
+
border-bottom:1px dashed #69c;
|
28
|
+
color:#039;
|
29
|
+
font-size:14px;
|
30
|
+
font-weight:normal;
|
31
|
+
padding:12px 17px;
|
32
|
+
}
|
33
|
+
td {
|
34
|
+
color:#669;
|
35
|
+
padding:7px 17px;
|
36
|
+
white-space: pre;
|
37
|
+
}
|
38
|
+
tbody tr:hover td {
|
39
|
+
background:#d0dafd;
|
40
|
+
color:#339;
|
41
|
+
}
|
42
|
+
tbody tr:nth-child(even) {
|
43
|
+
background:#e0eaff;
|
44
|
+
}
|
45
|
+
</style>
|
46
|
+
</head>
|
47
|
+
<body>
|
48
|
+
<table>
|
49
|
+
EOS
|
50
50
|
# @!visibility private
|
51
51
|
HTMLFOOTER = <<-EOS
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
52
|
+
</tbody>
|
53
|
+
</table>
|
54
|
+
<p>#COUNT# rows</p>
|
55
|
+
</body>
|
56
|
+
</html>
|
57
|
+
EOS
|
58
58
|
def open_file
|
59
59
|
@file << HTMLHEADER.gsub('#FILENAME#', @filename)
|
60
60
|
end
|
61
61
|
|
62
62
|
def header
|
63
|
+
return if @header.nil? || @header.empty?
|
63
64
|
@file << '<thead><tr>'
|
64
65
|
@file << @header.map { |f| '<th>' + f + '</th>' }.join
|
65
66
|
@file << "</tr></thead>\n<tbody>\n"
|
67
|
+
@tbody = true
|
66
68
|
end
|
67
69
|
|
68
70
|
def write_row(row)
|
71
|
+
unless @tbody
|
72
|
+
@file << "<tbody>\n"
|
73
|
+
@tbody = true
|
74
|
+
end
|
69
75
|
@file << '<tr>'
|
70
76
|
@file << row.map { |f| '<td>' + CGI.escapeHTML(f.to_s) + '</td>' }.join
|
71
77
|
@file << "</tr>\n"
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'json'
|
2
2
|
module DocParser
|
3
3
|
# The JSONOutput class generates a JSON file containing all rows as separate
|
4
|
-
#
|
4
|
+
# Array elements
|
5
5
|
# @see Output
|
6
6
|
class JSONOutput < Output
|
7
7
|
# @!visibility private
|
@@ -12,18 +12,23 @@ module DocParser
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def write_row(row)
|
15
|
+
raise MissingHeaderException if @header.nil? || @header.length == 0
|
15
16
|
if @first
|
16
17
|
@first = false
|
17
18
|
else
|
18
19
|
@file << ','
|
19
20
|
end
|
20
21
|
0.upto(@header.length - 1) do |counter|
|
21
|
-
|
22
|
+
if row.length > counter
|
23
|
+
@doc[@header[counter]] = row[counter]
|
24
|
+
else
|
25
|
+
@doc[@header[counter]] = ''
|
26
|
+
end
|
22
27
|
end
|
23
28
|
@file << JSON.dump(@doc)
|
24
29
|
end
|
25
30
|
|
26
|
-
def
|
31
|
+
def footer
|
27
32
|
@file << ']'
|
28
33
|
end
|
29
34
|
end
|
@@ -24,27 +24,23 @@ module DocParser
|
|
24
24
|
@outputs << HTMLOutput.new(htmloptions)
|
25
25
|
@outputs << YAMLOutput.new(yamloptions)
|
26
26
|
@outputs << XLSXOutput.new(xlsxoptions)
|
27
|
-
@outputs <<
|
27
|
+
@outputs << JSONOutput.new(jsonoptions)
|
28
28
|
end
|
29
29
|
|
30
30
|
def header=(row)
|
31
|
-
@outputs.each { |out| out.header = row
|
31
|
+
@outputs.each { |out| out.header = row }
|
32
32
|
end
|
33
33
|
|
34
34
|
def add_row(row)
|
35
|
-
@outputs.each { |out| out.add_row row
|
35
|
+
@outputs.each { |out| out.add_row row }
|
36
36
|
end
|
37
37
|
|
38
38
|
def rowcount
|
39
|
-
@outputs.
|
39
|
+
@outputs.map { |out| out.rowcount }.min
|
40
40
|
end
|
41
41
|
|
42
42
|
def close
|
43
43
|
@outputs.each { |out| out.close }
|
44
44
|
end
|
45
|
-
|
46
|
-
def summary
|
47
|
-
@outputs.map { |out| out.summary }.join("\n")
|
48
|
-
end
|
49
45
|
end
|
50
46
|
end
|