docparser 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rubocop.yml +5 -0
  4. data/.travis.yml +3 -0
  5. data/Gemfile +9 -1
  6. data/README.md +11 -4
  7. data/Rakefile +15 -0
  8. data/example.rb +9 -7
  9. data/lib/docparser.rb +1 -0
  10. data/lib/docparser/document.rb +18 -11
  11. data/lib/docparser/output.rb +8 -8
  12. data/lib/docparser/output/html_output.rb +53 -47
  13. data/lib/docparser/output/json_output.rb +8 -3
  14. data/lib/docparser/output/multi_output.rb +4 -8
  15. data/lib/docparser/output/nil_output.rb +21 -0
  16. data/lib/docparser/output/screen_output.rb +2 -1
  17. data/lib/docparser/output/xlsx_output.rb +12 -2
  18. data/lib/docparser/output/yaml_output.rb +6 -1
  19. data/lib/docparser/parser.rb +80 -49
  20. data/lib/docparser/version.rb +1 -1
  21. data/test/lib/docparser/blackbox_test.rb +29 -0
  22. data/test/lib/docparser/document_test.rb +134 -0
  23. data/test/lib/docparser/logging_test.rb +19 -0
  24. data/test/lib/docparser/output/csv_output_test.rb +51 -0
  25. data/test/lib/docparser/output/html_output_test.rb +57 -0
  26. data/test/lib/docparser/output/json_output_test.rb +65 -0
  27. data/test/lib/docparser/output/multi_output_test.rb +80 -0
  28. data/test/lib/docparser/output/nil_output_test.rb +27 -0
  29. data/test/lib/docparser/output/screen_output_test.rb +55 -0
  30. data/test/lib/docparser/output/xlsx_output_test.rb +53 -0
  31. data/test/lib/docparser/output/yaml_output_test.rb +76 -0
  32. data/test/lib/docparser/output_test.rb +85 -0
  33. data/test/lib/docparser/parser_test.rb +197 -0
  34. data/test/lib/docparser/version_test.rb +11 -0
  35. data/test/support/hackaday/dl.rb +4 -0
  36. data/test/support/hackaday/file_1.html +716 -0
  37. data/test/support/hackaday/file_10.html +791 -0
  38. data/test/support/hackaday/file_11.html +787 -0
  39. data/test/support/hackaday/file_12.html +715 -0
  40. data/test/support/hackaday/file_13.html +793 -0
  41. data/test/support/hackaday/file_14.html +718 -0
  42. data/test/support/hackaday/file_15.html +707 -0
  43. data/test/support/hackaday/file_16.html +713 -0
  44. data/test/support/hackaday/file_17.html +715 -0
  45. data/test/support/hackaday/file_18.html +725 -0
  46. data/test/support/hackaday/file_19.html +715 -0
  47. data/test/support/hackaday/file_2.html +793 -0
  48. data/test/support/hackaday/file_20.html +795 -0
  49. data/test/support/hackaday/file_21.html +804 -0
  50. data/test/support/hackaday/file_22.html +722 -0
  51. data/test/support/hackaday/file_23.html +793 -0
  52. data/test/support/hackaday/file_24.html +717 -0
  53. data/test/support/hackaday/file_25.html +715 -0
  54. data/test/support/hackaday/file_26.html +717 -0
  55. data/test/support/hackaday/file_27.html +723 -0
  56. data/test/support/hackaday/file_28.html +711 -0
  57. data/test/support/hackaday/file_29.html +711 -0
  58. data/test/support/hackaday/file_3.html +794 -0
  59. data/test/support/hackaday/file_30.html +715 -0
  60. data/test/support/hackaday/file_31.html +713 -0
  61. data/test/support/hackaday/file_32.html +714 -0
  62. data/test/support/hackaday/file_33.html +716 -0
  63. data/test/support/hackaday/file_34.html +714 -0
  64. data/test/support/hackaday/file_35.html +792 -0
  65. data/test/support/hackaday/file_36.html +719 -0
  66. data/test/support/hackaday/file_37.html +712 -0
  67. data/test/support/hackaday/file_38.html +709 -0
  68. data/test/support/hackaday/file_39.html +808 -0
  69. data/test/support/hackaday/file_4.html +814 -0
  70. data/test/support/hackaday/file_40.html +801 -0
  71. data/test/support/hackaday/file_5.html +715 -0
  72. data/test/support/hackaday/file_6.html +792 -0
  73. data/test/support/hackaday/file_7.html +714 -0
  74. data/test/support/hackaday/file_8.html +717 -0
  75. data/test/support/hackaday/file_9.html +719 -0
  76. data/test/support/test_encoding.html +12 -0
  77. data/test/support/test_encoding2.html +12 -0
  78. data/test/support/test_html.html +16 -0
  79. data/test/support/test_xml.xml +5 -0
  80. data/test/test_helper.rb +14 -0
  81. metadata +126 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 829c9585c7ac42c2496a8b385f2f19d4486a5e10
4
- data.tar.gz: 39f576740941b72e61babec808af6756adfeb1b8
3
+ metadata.gz: 0ba58a4708d78ae22fc79694754ddb70cc4fee63
4
+ data.tar.gz: cef89d6934e560633e8c3b05c8bb75e16e2c424b
5
5
  SHA512:
6
- metadata.gz: 32a9dc03bb9f413641b25cb0e51c9951fa8e1a64a48f4b1c8cc8e1e3877cbaa87b1ac70738f85c20f7383a3446d5a37915ee53a09fd30099b00be553779f4f4a
7
- data.tar.gz: b106a3fc8843a2d47f9d8c4f5ac65e39afc702fff8dcabfdfe89e0477731ac1c9576e9b795dcda53c927c8acc718846b2bb277a0e7e68654ead52cb97e612b3e
6
+ metadata.gz: e0db436a8578ca5d286c6a4946ea57a0c3ad38ed9d2db27803143fc3062c229c60dfe16ac156690d9e33b5f9041aa3bbff08b1d55f87f01467f18f97ef521d64
7
+ data.tar.gz: afca96d6dd7357fe08899d793e40ea6473e7bd9707318f93848cee8cd95c98d3ff57f6f9f8543a8b08174ceddf34d39aab83135fc242d89441d351c453bf7758
data/.gitignore CHANGED
@@ -16,3 +16,5 @@ test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
18
  hackaday.*
19
+
20
+ .DS_Store
data/.rubocop.yml ADDED
@@ -0,0 +1,5 @@
1
+ # Avoid methods longer than 20 lines of code
2
+ MethodLength:
3
+ Enabled: true
4
+ CountComments: false # count full line comments?
5
+ Max: 20
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.0.0
data/Gemfile CHANGED
@@ -6,4 +6,12 @@ gem 'parallel'
6
6
  gem 'axlsx'
7
7
  gem 'terminal-table'
8
8
  gem 'pageme'
9
- gem "json"
9
+ gem 'json'
10
+ gem 'log4r'
11
+
12
+ group :test do
13
+ gem 'rake'
14
+ gem 'rubocop'
15
+ gem 'simplecov', :require => false
16
+ gem 'simple_mock'
17
+ end
data/README.md CHANGED
@@ -1,21 +1,27 @@
1
1
  # DocParser
2
2
 
3
- Docs: http://rubydoc.info/github/jurriaan/docparser/
3
+ [![Build Status](https://travis-ci.org/jurriaan/docparser.png?branch=master)](https://travis-ci.org/jurriaan/docparser)
4
4
 
5
5
  DocParser is a web scraping/screen scraping tool.
6
+
6
7
  You can use it to easily scrape web sites.
7
8
 
9
+ The gem is called [docparser](http://rubygems.org/gems/docparser).
10
+ You can find the documentation [here](http://rubydoc.info/github/jurriaan/docparser/).
11
+
8
12
  ## Features
9
13
 
10
14
  - XPath and CSS support through Nokogiri
11
15
  - Support for loading of URLs through open-uri
12
16
  - Support for parallel processing of the documents
13
- - 5 Output formats:
17
+ - 6 Output formats:
14
18
  * CSV
15
19
  * XLSX
16
20
  * HTML
17
21
  * YAML
22
+ * JSON
18
23
  * Screen (for debugging and development)
24
+ * And more! (easy to extend)
19
25
 
20
26
  ## Installation
21
27
 
@@ -33,11 +39,12 @@ Or install it yourself as:
33
39
 
34
40
  ## Usage
35
41
 
36
- See example.rb
42
+ See [example.rb](https://github.com/jurriaan/docparser/blob/master/example.rb)
37
43
 
38
44
  ## Todo
39
45
 
40
46
  - Tests
47
+ - Better examples
41
48
 
42
49
  ## Contributing
43
50
 
@@ -49,4 +56,4 @@ See example.rb
49
56
 
50
57
  ## Contributors
51
58
 
52
- - Jurriaan Pruis
59
+ - [Jurriaan Pruis](https://github.com/jurriaan)
data/Rakefile CHANGED
@@ -1 +1,16 @@
1
1
  require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << 'lib/docparser'
6
+ t.test_files = FileList['test/lib/**/*_test.rb']
7
+ t.verbose = true
8
+ end
9
+
10
+ task test: :rubocop
11
+
12
+ task :rubocop do
13
+ sh 'rubocop'
14
+ end
15
+
16
+ task :default => :test
data/example.rb CHANGED
@@ -1,21 +1,23 @@
1
1
  #
2
-
3
- # An example of parsing a popular dutch website..
2
+ # An example of parsing hackaday.com
4
3
  # (C) 2013 Jurriaan Pruis
5
4
  #
5
+ $LOAD_PATH.unshift __dir__
6
+ require File.expand_path('lib/docparser.rb', __dir__)
7
+ require 'tmpdir'
6
8
 
7
- require 'docparser'
8
9
  include DocParser
9
- output = HTMLOutput.new filename: 'hackaday.html'
10
+ output = MultiOutput.new(filename: 'hackaday')
10
11
  output.header = 'Title', 'Author', 'Publication date', 'URL', 'Summary'
11
- parser = Parser.new(files: (1..20).map {|i| "http://hackaday.com/page/#{i}/"}, parallel: false, output: output)
12
+ files = Dir[File.join(__dir__, 'test/support/hackaday/*.html')]
13
+ parser = Parser.new(files: files, parallel: false, output: output)
12
14
  parser.parse! do
13
15
  css('#content .post') do |post|
14
16
  title_el = post.search('.entry-title a').first
15
17
  title = title_el.content
16
- author =post.search('.post-info .author .fn a').first.content
18
+ author = post.search('.post-info .author .fn a').first.content
17
19
  published_time = post.search('.post-info .date.published').first.content
18
- url = title_el.attributes['href']
20
+ url = title_el.attributes['href'].value
19
21
  summary = post.search('.entry-content').first.content.strip
20
22
  add_row title, author, published_time, url, summary
21
23
  end
data/lib/docparser.rb CHANGED
@@ -1 +1,2 @@
1
+ $LOAD_PATH.unshift __dir__
1
2
  require 'docparser/parser'
@@ -5,26 +5,29 @@ module DocParser
5
5
  # @see Output
6
6
  class Document
7
7
  attr_reader :filename, :doc, :encoding, :results
8
- def initialize(filename, encoding: 'utf-8', parser: nil)
8
+ def initialize(filename: nil, encoding: 'utf-8', parser: nil)
9
9
  if encoding == 'utf-8'
10
10
  encodingstring = 'r:utf-8'
11
11
  else
12
12
  encodingstring = "r:#{encoding}:utf-8"
13
13
  end
14
-
14
+ @logger = Log4r::Logger.new('docparser::document')
15
+ @logger.debug { "Parsing #{filename}" }
15
16
  open(filename, encodingstring) do |f|
16
- @doc = Nokogiri::HTML(f)
17
+ @html = f.read
18
+ @logger.warn "#{filename} is empty" if @html.empty?
19
+ @doc = Nokogiri(@html)
17
20
  end
18
-
19
21
  @encoding = encoding
20
22
  @parser = parser
21
23
  @filename = filename
22
- @results = Array.new(@parser.outputs.length) { [] }
24
+ @results = Array.new(@parser.outputs ? @parser.outputs.length : 0) { [] }
23
25
  end
24
26
 
25
27
  # Adds a row to an output
26
28
  def add_row(*row, output: 0)
27
29
  output = @parser.outputs.index(output) if output.is_a? Output
30
+ @logger.debug { "#{filename}: Adding row #{row.flatten.to_s}" }
28
31
  results[output] << row.flatten
29
32
  end
30
33
 
@@ -36,13 +39,17 @@ module DocParser
36
39
 
37
40
  # @return [String] the source of the document
38
41
  def html
39
- @html ||= @doc.inner_html #TODO: ??
42
+ @html
40
43
  end
41
44
 
42
45
  # Executes a xpath query
43
46
  def xpath(query)
44
47
  res = @doc.search(query)
45
- res.each { |el| yield el } if block_given?
48
+ if block_given?
49
+ res.each { |el| yield el }
50
+ else
51
+ res
52
+ end
46
53
  end
47
54
 
48
55
  # Executes a xpath query and returns the content
@@ -58,7 +65,7 @@ module DocParser
58
65
 
59
66
  # Matches the HTML source using a regular expression
60
67
  def regexp(regexp)
61
- html.match(regexp) rescue nil
68
+ html.match(regexp)
62
69
  end
63
70
 
64
71
  # Parses the document
@@ -70,10 +77,10 @@ module DocParser
70
77
 
71
78
  # @!visibility private
72
79
  def inspect
73
- "<Document file:'#{@filename}'>"
80
+ "<Document file:'#{@filename}', encoding:'#{@encoding}'>"
74
81
  end
75
82
 
76
- alias :css :xpath
77
- alias :css_content :xpath_content
83
+ alias_method :css, :xpath
84
+ alias_method :css_content, :xpath_content
78
85
  end
79
86
  end
@@ -11,6 +11,8 @@ module DocParser
11
11
  @filename = filename
12
12
  raise ArgumentError, 'Please specify a filename' if filename.empty?
13
13
  @file = open filename, 'w'
14
+ classname = self.class.name.split('::').last
15
+ @logger = Log4r::Logger.new("docparser::output::#{classname}")
14
16
  open_file
15
17
  end
16
18
 
@@ -30,6 +32,9 @@ module DocParser
30
32
  def close
31
33
  footer
32
34
  @file.close unless @file.closed?
35
+ @logger.info "Finished writing"
36
+ size = File.size(@filename) / 1024.0
37
+ @logger.info sprintf("%s: %d rows, %.2f KiB", @filename, rowcount, size)
33
38
  end
34
39
 
35
40
  # Called after the file is opened
@@ -44,19 +49,14 @@ module DocParser
44
49
 
45
50
  # Called when a row is added
46
51
  def write_row(row)
47
- raise 'No row writer defined'
52
+ raise NotImplementedError.new('No row writer defined')
48
53
  end
49
54
 
50
55
  # Called before closing the file
51
56
  def footer
52
57
  end
58
+ end
53
59
 
54
- # Displays information about the output
55
- # @return [String] containing number of rows and file size
56
- def summary
57
- "%s:\t%d rows, %9.2f KiB" % [@filename,
58
- @rowcount,
59
- File.size(@filename) / 1024.0]
60
- end
60
+ class MissingHeaderException < StandardError
61
61
  end
62
62
  end
@@ -5,67 +5,73 @@ module DocParser
5
5
  class HTMLOutput < Output
6
6
  # @!visibility private
7
7
  HTMLHEADER = <<-EOS
8
- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
9
- "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
10
- <html>
11
- <head>
12
- <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
13
- <title>HTML output "#FILENAME#"</title>
14
- <style type="text/css">
15
- body {
16
- font-family:"Helvetica Neue", Helvetica, Sans-Serif;
17
- font-size:12px;
18
- }
19
- table {
20
- border:1px solid #69c;
21
- border-collapse:collapse;
8
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
9
+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
10
+ <html>
11
+ <head>
12
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
13
+ <title>HTML output "#FILENAME#"</title>
14
+ <style type="text/css">
15
+ body {
16
+ font-family:"Helvetica Neue", Helvetica, Sans-Serif;
22
17
  font-size:12px;
23
- text-align:left;
24
- width:480px;
25
- }
26
- th {
27
- border-bottom:1px dashed #69c;
28
- color:#039;
29
- font-size:14px;
30
- font-weight:normal;
31
- padding:12px 17px;
32
- }
33
- td {
34
- color:#669;
35
- padding:7px 17px;
36
- white-space: pre;
37
- }
38
- tbody tr:hover td {
39
- background:#d0dafd;
40
- color:#339;
41
- }
42
- tbody tr:nth-child(even) {
43
- background:#e0eaff;
44
- }
45
- </style>
46
- </head>
47
- <body>
48
- <table>
49
- EOS
18
+ }
19
+ table {
20
+ border:1px solid #69c;
21
+ border-collapse:collapse;
22
+ font-size:12px;
23
+ text-align:left;
24
+ width:480px;
25
+ }
26
+ th {
27
+ border-bottom:1px dashed #69c;
28
+ color:#039;
29
+ font-size:14px;
30
+ font-weight:normal;
31
+ padding:12px 17px;
32
+ }
33
+ td {
34
+ color:#669;
35
+ padding:7px 17px;
36
+ white-space: pre;
37
+ }
38
+ tbody tr:hover td {
39
+ background:#d0dafd;
40
+ color:#339;
41
+ }
42
+ tbody tr:nth-child(even) {
43
+ background:#e0eaff;
44
+ }
45
+ </style>
46
+ </head>
47
+ <body>
48
+ <table>
49
+ EOS
50
50
  # @!visibility private
51
51
  HTMLFOOTER = <<-EOS
52
- </tbody>
53
- </table>
54
- <p>#COUNT# rows</p>
55
- </body>
56
- </html>
57
- EOS
52
+ </tbody>
53
+ </table>
54
+ <p>#COUNT# rows</p>
55
+ </body>
56
+ </html>
57
+ EOS
58
58
  def open_file
59
59
  @file << HTMLHEADER.gsub('#FILENAME#', @filename)
60
60
  end
61
61
 
62
62
  def header
63
+ return if @header.nil? || @header.empty?
63
64
  @file << '<thead><tr>'
64
65
  @file << @header.map { |f| '<th>' + f + '</th>' }.join
65
66
  @file << "</tr></thead>\n<tbody>\n"
67
+ @tbody = true
66
68
  end
67
69
 
68
70
  def write_row(row)
71
+ unless @tbody
72
+ @file << "<tbody>\n"
73
+ @tbody = true
74
+ end
69
75
  @file << '<tr>'
70
76
  @file << row.map { |f| '<td>' + CGI.escapeHTML(f.to_s) + '</td>' }.join
71
77
  @file << "</tr>\n"
@@ -1,7 +1,7 @@
1
1
  require 'json'
2
2
  module DocParser
3
3
  # The JSONOutput class generates a JSON file containing all rows as separate
4
- # JSON documents
4
+ # Array elements
5
5
  # @see Output
6
6
  class JSONOutput < Output
7
7
  # @!visibility private
@@ -12,18 +12,23 @@ module DocParser
12
12
  end
13
13
 
14
14
  def write_row(row)
15
+ raise MissingHeaderException if @header.nil? || @header.length == 0
15
16
  if @first
16
17
  @first = false
17
18
  else
18
19
  @file << ','
19
20
  end
20
21
  0.upto(@header.length - 1) do |counter|
21
- @doc[@header[counter]] = row[counter] rescue ''
22
+ if row.length > counter
23
+ @doc[@header[counter]] = row[counter]
24
+ else
25
+ @doc[@header[counter]] = ''
26
+ end
22
27
  end
23
28
  @file << JSON.dump(@doc)
24
29
  end
25
30
 
26
- def close
31
+ def footer
27
32
  @file << ']'
28
33
  end
29
34
  end
@@ -24,27 +24,23 @@ module DocParser
24
24
  @outputs << HTMLOutput.new(htmloptions)
25
25
  @outputs << YAMLOutput.new(yamloptions)
26
26
  @outputs << XLSXOutput.new(xlsxoptions)
27
- @outputs << XLSXOutput.new(jsonoptions)
27
+ @outputs << JSONOutput.new(jsonoptions)
28
28
  end
29
29
 
30
30
  def header=(row)
31
- @outputs.each { |out| out.header = row.flatten }
31
+ @outputs.each { |out| out.header = row }
32
32
  end
33
33
 
34
34
  def add_row(row)
35
- @outputs.each { |out| out.add_row row.flatten }
35
+ @outputs.each { |out| out.add_row row }
36
36
  end
37
37
 
38
38
  def rowcount
39
- @outputs.min { |out| out.rowcount }.rowcount
39
+ @outputs.map { |out| out.rowcount }.min
40
40
  end
41
41
 
42
42
  def close
43
43
  @outputs.each { |out| out.close }
44
44
  end
45
-
46
- def summary
47
- @outputs.map { |out| out.summary }.join("\n")
48
- end
49
45
  end
50
46
  end