docparser 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rubocop.yml +5 -0
- data/.travis.yml +3 -0
- data/Gemfile +9 -1
- data/README.md +11 -4
- data/Rakefile +15 -0
- data/example.rb +9 -7
- data/lib/docparser.rb +1 -0
- data/lib/docparser/document.rb +18 -11
- data/lib/docparser/output.rb +8 -8
- data/lib/docparser/output/html_output.rb +53 -47
- data/lib/docparser/output/json_output.rb +8 -3
- data/lib/docparser/output/multi_output.rb +4 -8
- data/lib/docparser/output/nil_output.rb +21 -0
- data/lib/docparser/output/screen_output.rb +2 -1
- data/lib/docparser/output/xlsx_output.rb +12 -2
- data/lib/docparser/output/yaml_output.rb +6 -1
- data/lib/docparser/parser.rb +80 -49
- data/lib/docparser/version.rb +1 -1
- data/test/lib/docparser/blackbox_test.rb +29 -0
- data/test/lib/docparser/document_test.rb +134 -0
- data/test/lib/docparser/logging_test.rb +19 -0
- data/test/lib/docparser/output/csv_output_test.rb +51 -0
- data/test/lib/docparser/output/html_output_test.rb +57 -0
- data/test/lib/docparser/output/json_output_test.rb +65 -0
- data/test/lib/docparser/output/multi_output_test.rb +80 -0
- data/test/lib/docparser/output/nil_output_test.rb +27 -0
- data/test/lib/docparser/output/screen_output_test.rb +55 -0
- data/test/lib/docparser/output/xlsx_output_test.rb +53 -0
- data/test/lib/docparser/output/yaml_output_test.rb +76 -0
- data/test/lib/docparser/output_test.rb +85 -0
- data/test/lib/docparser/parser_test.rb +197 -0
- data/test/lib/docparser/version_test.rb +11 -0
- data/test/support/hackaday/dl.rb +4 -0
- data/test/support/hackaday/file_1.html +716 -0
- data/test/support/hackaday/file_10.html +791 -0
- data/test/support/hackaday/file_11.html +787 -0
- data/test/support/hackaday/file_12.html +715 -0
- data/test/support/hackaday/file_13.html +793 -0
- data/test/support/hackaday/file_14.html +718 -0
- data/test/support/hackaday/file_15.html +707 -0
- data/test/support/hackaday/file_16.html +713 -0
- data/test/support/hackaday/file_17.html +715 -0
- data/test/support/hackaday/file_18.html +725 -0
- data/test/support/hackaday/file_19.html +715 -0
- data/test/support/hackaday/file_2.html +793 -0
- data/test/support/hackaday/file_20.html +795 -0
- data/test/support/hackaday/file_21.html +804 -0
- data/test/support/hackaday/file_22.html +722 -0
- data/test/support/hackaday/file_23.html +793 -0
- data/test/support/hackaday/file_24.html +717 -0
- data/test/support/hackaday/file_25.html +715 -0
- data/test/support/hackaday/file_26.html +717 -0
- data/test/support/hackaday/file_27.html +723 -0
- data/test/support/hackaday/file_28.html +711 -0
- data/test/support/hackaday/file_29.html +711 -0
- data/test/support/hackaday/file_3.html +794 -0
- data/test/support/hackaday/file_30.html +715 -0
- data/test/support/hackaday/file_31.html +713 -0
- data/test/support/hackaday/file_32.html +714 -0
- data/test/support/hackaday/file_33.html +716 -0
- data/test/support/hackaday/file_34.html +714 -0
- data/test/support/hackaday/file_35.html +792 -0
- data/test/support/hackaday/file_36.html +719 -0
- data/test/support/hackaday/file_37.html +712 -0
- data/test/support/hackaday/file_38.html +709 -0
- data/test/support/hackaday/file_39.html +808 -0
- data/test/support/hackaday/file_4.html +814 -0
- data/test/support/hackaday/file_40.html +801 -0
- data/test/support/hackaday/file_5.html +715 -0
- data/test/support/hackaday/file_6.html +792 -0
- data/test/support/hackaday/file_7.html +714 -0
- data/test/support/hackaday/file_8.html +717 -0
- data/test/support/hackaday/file_9.html +719 -0
- data/test/support/test_encoding.html +12 -0
- data/test/support/test_encoding2.html +12 -0
- data/test/support/test_html.html +16 -0
- data/test/support/test_xml.xml +5 -0
- data/test/test_helper.rb +14 -0
- metadata +126 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ba58a4708d78ae22fc79694754ddb70cc4fee63
|
4
|
+
data.tar.gz: cef89d6934e560633e8c3b05c8bb75e16e2c424b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e0db436a8578ca5d286c6a4946ea57a0c3ad38ed9d2db27803143fc3062c229c60dfe16ac156690d9e33b5f9041aa3bbff08b1d55f87f01467f18f97ef521d64
|
7
|
+
data.tar.gz: afca96d6dd7357fe08899d793e40ea6473e7bd9707318f93848cee8cd95c98d3ff57f6f9f8543a8b08174ceddf34d39aab83135fc242d89441d351c453bf7758
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
data/.travis.yml
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,21 +1,27 @@
|
|
1
1
|
# DocParser
|
2
2
|
|
3
|
-
|
3
|
+
[](https://travis-ci.org/jurriaan/docparser)
|
4
4
|
|
5
5
|
DocParser is a web scraping/screen scraping tool.
|
6
|
+
|
6
7
|
You can use it to easily scrape web sites.
|
7
8
|
|
9
|
+
The gem is called [docparser](http://rubygems.org/gems/docparser).
|
10
|
+
You can find the documentation [here](http://rubydoc.info/github/jurriaan/docparser/).
|
11
|
+
|
8
12
|
## Features
|
9
13
|
|
10
14
|
- XPath and CSS support through Nokogiri
|
11
15
|
- Support for loading of URLs throug open-uri
|
12
16
|
- Support for parallel processing of the documents
|
13
|
-
-
|
17
|
+
- 6 Output formats:
|
14
18
|
* CSV
|
15
19
|
* XLSX
|
16
20
|
* HTML
|
17
21
|
* YAML
|
22
|
+
* JSON
|
18
23
|
* Screen (for debugging and development)
|
24
|
+
* And more! (easy to extend)
|
19
25
|
|
20
26
|
## Installation
|
21
27
|
|
@@ -33,11 +39,12 @@ Or install it yourself as:
|
|
33
39
|
|
34
40
|
## Usage
|
35
41
|
|
36
|
-
See example.rb
|
42
|
+
See [example.rb](https://github.com/jurriaan/docparser/blob/master/example.rb)
|
37
43
|
|
38
44
|
## Todo
|
39
45
|
|
40
46
|
- Tests
|
47
|
+
- Better examples
|
41
48
|
|
42
49
|
## Contributing
|
43
50
|
|
@@ -49,4 +56,4 @@ See example.rb
|
|
49
56
|
|
50
57
|
## Contributors
|
51
58
|
|
52
|
-
- Jurriaan Pruis
|
59
|
+
- [Jurriaan Pruis](https://github.com/jurriaan)
|
data/Rakefile
CHANGED
@@ -1 +1,16 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
|
+
require 'rake/testtask'
|
3
|
+
|
4
|
+
Rake::TestTask.new do |t|
|
5
|
+
t.libs << 'lib/docparser'
|
6
|
+
t.test_files = FileList['test/lib/**/*_test.rb']
|
7
|
+
t.verbose = true
|
8
|
+
end
|
9
|
+
|
10
|
+
task test: :rubocop
|
11
|
+
|
12
|
+
task :rubocop do
|
13
|
+
sh 'rubocop'
|
14
|
+
end
|
15
|
+
|
16
|
+
task :default => :test
|
data/example.rb
CHANGED
@@ -1,21 +1,23 @@
|
|
1
1
|
#
|
2
|
-
|
3
|
-
# An example of parsing a popular dutch website..
|
2
|
+
# An example of parsing hackaday.com
|
4
3
|
# (C) 2013 Jurriaan Pruis
|
5
4
|
#
|
5
|
+
$LOAD_PATH.unshift __dir__
|
6
|
+
require File.expand_path('lib/docparser.rb', __dir__)
|
7
|
+
require 'tmpdir'
|
6
8
|
|
7
|
-
require 'docparser'
|
8
9
|
include DocParser
|
9
|
-
output =
|
10
|
+
output = MultiOutput.new(filename: 'hackaday')
|
10
11
|
output.header = 'Title', 'Author', 'Publication date', 'URL', 'Summary'
|
11
|
-
|
12
|
+
files = Dir[File.join(__dir__, 'test/support/hackaday/*.html')]
|
13
|
+
parser = Parser.new(files: files, parallel: false, output: output)
|
12
14
|
parser.parse! do
|
13
15
|
css('#content .post') do |post|
|
14
16
|
title_el = post.search('.entry-title a').first
|
15
17
|
title = title_el.content
|
16
|
-
author =post.search('.post-info .author .fn a').first.content
|
18
|
+
author = post.search('.post-info .author .fn a').first.content
|
17
19
|
published_time = post.search('.post-info .date.published').first.content
|
18
|
-
url = title_el.attributes['href']
|
20
|
+
url = title_el.attributes['href'].value
|
19
21
|
summary = post.search('.entry-content').first.content.strip
|
20
22
|
add_row title, author, published_time, url, summary
|
21
23
|
end
|
data/lib/docparser.rb
CHANGED
data/lib/docparser/document.rb
CHANGED
@@ -5,26 +5,29 @@ module DocParser
|
|
5
5
|
# @see Output
|
6
6
|
class Document
|
7
7
|
attr_reader :filename, :doc, :encoding, :results
|
8
|
-
def initialize(filename, encoding: 'utf-8', parser: nil)
|
8
|
+
def initialize(filename: nil, encoding: 'utf-8', parser: nil)
|
9
9
|
if encoding == 'utf-8'
|
10
10
|
encodingstring = 'r:utf-8'
|
11
11
|
else
|
12
12
|
encodingstring = "r:#{encoding}:utf-8"
|
13
13
|
end
|
14
|
-
|
14
|
+
@logger = Log4r::Logger.new('docparser::document')
|
15
|
+
@logger.debug { "Parsing #{filename}" }
|
15
16
|
open(filename, encodingstring) do |f|
|
16
|
-
@
|
17
|
+
@html = f.read
|
18
|
+
@logger.warn "#{filename} is empty" if @html.empty?
|
19
|
+
@doc = Nokogiri(@html)
|
17
20
|
end
|
18
|
-
|
19
21
|
@encoding = encoding
|
20
22
|
@parser = parser
|
21
23
|
@filename = filename
|
22
|
-
@results = Array.new(@parser.outputs.length) { [] }
|
24
|
+
@results = Array.new(@parser.outputs ? @parser.outputs.length : 0) { [] }
|
23
25
|
end
|
24
26
|
|
25
27
|
# Adds a row to an output
|
26
28
|
def add_row(*row, output: 0)
|
27
29
|
output = @parser.outputs.index(output) if output.is_a? Output
|
30
|
+
@logger.debug { "#{filename}: Adding row #{row.flatten.to_s}" }
|
28
31
|
results[output] << row.flatten
|
29
32
|
end
|
30
33
|
|
@@ -36,13 +39,17 @@ module DocParser
|
|
36
39
|
|
37
40
|
# @return [String] the source of the document
|
38
41
|
def html
|
39
|
-
@html
|
42
|
+
@html
|
40
43
|
end
|
41
44
|
|
42
45
|
# Executes a xpath query
|
43
46
|
def xpath(query)
|
44
47
|
res = @doc.search(query)
|
45
|
-
|
48
|
+
if block_given?
|
49
|
+
res.each { |el| yield el }
|
50
|
+
else
|
51
|
+
res
|
52
|
+
end
|
46
53
|
end
|
47
54
|
|
48
55
|
# Executes a xpath query and returns the content
|
@@ -58,7 +65,7 @@ module DocParser
|
|
58
65
|
|
59
66
|
# Matches the HTML source using a regular expression
|
60
67
|
def regexp(regexp)
|
61
|
-
html.match(regexp)
|
68
|
+
html.match(regexp)
|
62
69
|
end
|
63
70
|
|
64
71
|
# Parses the document
|
@@ -70,10 +77,10 @@ module DocParser
|
|
70
77
|
|
71
78
|
# @!visibility private
|
72
79
|
def inspect
|
73
|
-
"<Document file:'#{@filename}'>"
|
80
|
+
"<Document file:'#{@filename}', encoding:'#{@encoding}'>"
|
74
81
|
end
|
75
82
|
|
76
|
-
|
77
|
-
|
83
|
+
alias_method :css, :xpath
|
84
|
+
alias_method :css_content, :xpath_content
|
78
85
|
end
|
79
86
|
end
|
data/lib/docparser/output.rb
CHANGED
@@ -11,6 +11,8 @@ module DocParser
|
|
11
11
|
@filename = filename
|
12
12
|
raise ArgumentError, 'Please specify a filename' if filename.empty?
|
13
13
|
@file = open filename, 'w'
|
14
|
+
classname = self.class.name.split('::').last
|
15
|
+
@logger = Log4r::Logger.new("docparser::output::#{classname}")
|
14
16
|
open_file
|
15
17
|
end
|
16
18
|
|
@@ -30,6 +32,9 @@ module DocParser
|
|
30
32
|
def close
|
31
33
|
footer
|
32
34
|
@file.close unless @file.closed?
|
35
|
+
@logger.info "Finished writing"
|
36
|
+
size = File.size(@filename) / 1024.0
|
37
|
+
@logger.info sprintf("%s: %d rows, %.2f KiB", @filename, rowcount, size)
|
33
38
|
end
|
34
39
|
|
35
40
|
# Called after the file is opened
|
@@ -44,19 +49,14 @@ module DocParser
|
|
44
49
|
|
45
50
|
# Called when a row is added
|
46
51
|
def write_row(row)
|
47
|
-
raise 'No row writer defined'
|
52
|
+
raise NotImplementedError.new('No row writer defined')
|
48
53
|
end
|
49
54
|
|
50
55
|
# Called before closing the file
|
51
56
|
def footer
|
52
57
|
end
|
58
|
+
end
|
53
59
|
|
54
|
-
|
55
|
-
# @return [String] containing number of rows and file size
|
56
|
-
def summary
|
57
|
-
"%s:\t%d rows, %9.2f KiB" % [@filename,
|
58
|
-
@rowcount,
|
59
|
-
File.size(@filename) / 1024.0]
|
60
|
-
end
|
60
|
+
class MissingHeaderException < StandardError
|
61
61
|
end
|
62
62
|
end
|
@@ -5,67 +5,73 @@ module DocParser
|
|
5
5
|
class HTMLOutput < Output
|
6
6
|
# @!visibility private
|
7
7
|
HTMLHEADER = <<-EOS
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
font-size:12px;
|
18
|
-
}
|
19
|
-
table {
|
20
|
-
border:1px solid #69c;
|
21
|
-
border-collapse:collapse;
|
8
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
9
|
+
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
10
|
+
<html>
|
11
|
+
<head>
|
12
|
+
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
13
|
+
<title>HTML output "#FILENAME#"</title>
|
14
|
+
<style type="text/css">
|
15
|
+
body {
|
16
|
+
font-family:"Helvetica Neue", Helvetica, Sans-Serif;
|
22
17
|
font-size:12px;
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
18
|
+
}
|
19
|
+
table {
|
20
|
+
border:1px solid #69c;
|
21
|
+
border-collapse:collapse;
|
22
|
+
font-size:12px;
|
23
|
+
text-align:left;
|
24
|
+
width:480px;
|
25
|
+
}
|
26
|
+
th {
|
27
|
+
border-bottom:1px dashed #69c;
|
28
|
+
color:#039;
|
29
|
+
font-size:14px;
|
30
|
+
font-weight:normal;
|
31
|
+
padding:12px 17px;
|
32
|
+
}
|
33
|
+
td {
|
34
|
+
color:#669;
|
35
|
+
padding:7px 17px;
|
36
|
+
white-space: pre;
|
37
|
+
}
|
38
|
+
tbody tr:hover td {
|
39
|
+
background:#d0dafd;
|
40
|
+
color:#339;
|
41
|
+
}
|
42
|
+
tbody tr:nth-child(even) {
|
43
|
+
background:#e0eaff;
|
44
|
+
}
|
45
|
+
</style>
|
46
|
+
</head>
|
47
|
+
<body>
|
48
|
+
<table>
|
49
|
+
EOS
|
50
50
|
# @!visibility private
|
51
51
|
HTMLFOOTER = <<-EOS
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
52
|
+
</tbody>
|
53
|
+
</table>
|
54
|
+
<p>#COUNT# rows</p>
|
55
|
+
</body>
|
56
|
+
</html>
|
57
|
+
EOS
|
58
58
|
def open_file
|
59
59
|
@file << HTMLHEADER.gsub('#FILENAME#', @filename)
|
60
60
|
end
|
61
61
|
|
62
62
|
def header
|
63
|
+
return if @header.nil? || @header.empty?
|
63
64
|
@file << '<thead><tr>'
|
64
65
|
@file << @header.map { |f| '<th>' + f + '</th>' }.join
|
65
66
|
@file << "</tr></thead>\n<tbody>\n"
|
67
|
+
@tbody = true
|
66
68
|
end
|
67
69
|
|
68
70
|
def write_row(row)
|
71
|
+
unless @tbody
|
72
|
+
@file << "<tbody>\n"
|
73
|
+
@tbody = true
|
74
|
+
end
|
69
75
|
@file << '<tr>'
|
70
76
|
@file << row.map { |f| '<td>' + CGI.escapeHTML(f.to_s) + '</td>' }.join
|
71
77
|
@file << "</tr>\n"
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'json'
|
2
2
|
module DocParser
|
3
3
|
# The JSONOutput class generates a JSON file containing all rows as seperate
|
4
|
-
#
|
4
|
+
# Array elements
|
5
5
|
# @see Output
|
6
6
|
class JSONOutput < Output
|
7
7
|
# @!visibility private
|
@@ -12,18 +12,23 @@ module DocParser
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def write_row(row)
|
15
|
+
raise MissingHeaderException if @header.nil? || @header.length == 0
|
15
16
|
if @first
|
16
17
|
@first = false
|
17
18
|
else
|
18
19
|
@file << ','
|
19
20
|
end
|
20
21
|
0.upto(@header.length - 1) do |counter|
|
21
|
-
|
22
|
+
if row.length > counter
|
23
|
+
@doc[@header[counter]] = row[counter]
|
24
|
+
else
|
25
|
+
@doc[@header[counter]] = ''
|
26
|
+
end
|
22
27
|
end
|
23
28
|
@file << JSON.dump(@doc)
|
24
29
|
end
|
25
30
|
|
26
|
-
def
|
31
|
+
def footer
|
27
32
|
@file << ']'
|
28
33
|
end
|
29
34
|
end
|
@@ -24,27 +24,23 @@ module DocParser
|
|
24
24
|
@outputs << HTMLOutput.new(htmloptions)
|
25
25
|
@outputs << YAMLOutput.new(yamloptions)
|
26
26
|
@outputs << XLSXOutput.new(xlsxoptions)
|
27
|
-
@outputs <<
|
27
|
+
@outputs << JSONOutput.new(jsonoptions)
|
28
28
|
end
|
29
29
|
|
30
30
|
def header=(row)
|
31
|
-
@outputs.each { |out| out.header = row
|
31
|
+
@outputs.each { |out| out.header = row }
|
32
32
|
end
|
33
33
|
|
34
34
|
def add_row(row)
|
35
|
-
@outputs.each { |out| out.add_row row
|
35
|
+
@outputs.each { |out| out.add_row row }
|
36
36
|
end
|
37
37
|
|
38
38
|
def rowcount
|
39
|
-
@outputs.
|
39
|
+
@outputs.map { |out| out.rowcount }.min
|
40
40
|
end
|
41
41
|
|
42
42
|
def close
|
43
43
|
@outputs.each { |out| out.close }
|
44
44
|
end
|
45
|
-
|
46
|
-
def summary
|
47
|
-
@outputs.map { |out| out.summary }.join("\n")
|
48
|
-
end
|
49
45
|
end
|
50
46
|
end
|