docparser 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rubocop.yml +5 -0
  4. data/.travis.yml +3 -0
  5. data/Gemfile +9 -1
  6. data/README.md +11 -4
  7. data/Rakefile +15 -0
  8. data/example.rb +9 -7
  9. data/lib/docparser.rb +1 -0
  10. data/lib/docparser/document.rb +18 -11
  11. data/lib/docparser/output.rb +8 -8
  12. data/lib/docparser/output/html_output.rb +53 -47
  13. data/lib/docparser/output/json_output.rb +8 -3
  14. data/lib/docparser/output/multi_output.rb +4 -8
  15. data/lib/docparser/output/nil_output.rb +21 -0
  16. data/lib/docparser/output/screen_output.rb +2 -1
  17. data/lib/docparser/output/xlsx_output.rb +12 -2
  18. data/lib/docparser/output/yaml_output.rb +6 -1
  19. data/lib/docparser/parser.rb +80 -49
  20. data/lib/docparser/version.rb +1 -1
  21. data/test/lib/docparser/blackbox_test.rb +29 -0
  22. data/test/lib/docparser/document_test.rb +134 -0
  23. data/test/lib/docparser/logging_test.rb +19 -0
  24. data/test/lib/docparser/output/csv_output_test.rb +51 -0
  25. data/test/lib/docparser/output/html_output_test.rb +57 -0
  26. data/test/lib/docparser/output/json_output_test.rb +65 -0
  27. data/test/lib/docparser/output/multi_output_test.rb +80 -0
  28. data/test/lib/docparser/output/nil_output_test.rb +27 -0
  29. data/test/lib/docparser/output/screen_output_test.rb +55 -0
  30. data/test/lib/docparser/output/xlsx_output_test.rb +53 -0
  31. data/test/lib/docparser/output/yaml_output_test.rb +76 -0
  32. data/test/lib/docparser/output_test.rb +85 -0
  33. data/test/lib/docparser/parser_test.rb +197 -0
  34. data/test/lib/docparser/version_test.rb +11 -0
  35. data/test/support/hackaday/dl.rb +4 -0
  36. data/test/support/hackaday/file_1.html +716 -0
  37. data/test/support/hackaday/file_10.html +791 -0
  38. data/test/support/hackaday/file_11.html +787 -0
  39. data/test/support/hackaday/file_12.html +715 -0
  40. data/test/support/hackaday/file_13.html +793 -0
  41. data/test/support/hackaday/file_14.html +718 -0
  42. data/test/support/hackaday/file_15.html +707 -0
  43. data/test/support/hackaday/file_16.html +713 -0
  44. data/test/support/hackaday/file_17.html +715 -0
  45. data/test/support/hackaday/file_18.html +725 -0
  46. data/test/support/hackaday/file_19.html +715 -0
  47. data/test/support/hackaday/file_2.html +793 -0
  48. data/test/support/hackaday/file_20.html +795 -0
  49. data/test/support/hackaday/file_21.html +804 -0
  50. data/test/support/hackaday/file_22.html +722 -0
  51. data/test/support/hackaday/file_23.html +793 -0
  52. data/test/support/hackaday/file_24.html +717 -0
  53. data/test/support/hackaday/file_25.html +715 -0
  54. data/test/support/hackaday/file_26.html +717 -0
  55. data/test/support/hackaday/file_27.html +723 -0
  56. data/test/support/hackaday/file_28.html +711 -0
  57. data/test/support/hackaday/file_29.html +711 -0
  58. data/test/support/hackaday/file_3.html +794 -0
  59. data/test/support/hackaday/file_30.html +715 -0
  60. data/test/support/hackaday/file_31.html +713 -0
  61. data/test/support/hackaday/file_32.html +714 -0
  62. data/test/support/hackaday/file_33.html +716 -0
  63. data/test/support/hackaday/file_34.html +714 -0
  64. data/test/support/hackaday/file_35.html +792 -0
  65. data/test/support/hackaday/file_36.html +719 -0
  66. data/test/support/hackaday/file_37.html +712 -0
  67. data/test/support/hackaday/file_38.html +709 -0
  68. data/test/support/hackaday/file_39.html +808 -0
  69. data/test/support/hackaday/file_4.html +814 -0
  70. data/test/support/hackaday/file_40.html +801 -0
  71. data/test/support/hackaday/file_5.html +715 -0
  72. data/test/support/hackaday/file_6.html +792 -0
  73. data/test/support/hackaday/file_7.html +714 -0
  74. data/test/support/hackaday/file_8.html +717 -0
  75. data/test/support/hackaday/file_9.html +719 -0
  76. data/test/support/test_encoding.html +12 -0
  77. data/test/support/test_encoding2.html +12 -0
  78. data/test/support/test_html.html +16 -0
  79. data/test/support/test_xml.xml +5 -0
  80. data/test/test_helper.rb +14 -0
  81. metadata +126 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 829c9585c7ac42c2496a8b385f2f19d4486a5e10
4
- data.tar.gz: 39f576740941b72e61babec808af6756adfeb1b8
3
+ metadata.gz: 0ba58a4708d78ae22fc79694754ddb70cc4fee63
4
+ data.tar.gz: cef89d6934e560633e8c3b05c8bb75e16e2c424b
5
5
  SHA512:
6
- metadata.gz: 32a9dc03bb9f413641b25cb0e51c9951fa8e1a64a48f4b1c8cc8e1e3877cbaa87b1ac70738f85c20f7383a3446d5a37915ee53a09fd30099b00be553779f4f4a
7
- data.tar.gz: b106a3fc8843a2d47f9d8c4f5ac65e39afc702fff8dcabfdfe89e0477731ac1c9576e9b795dcda53c927c8acc718846b2bb277a0e7e68654ead52cb97e612b3e
6
+ metadata.gz: e0db436a8578ca5d286c6a4946ea57a0c3ad38ed9d2db27803143fc3062c229c60dfe16ac156690d9e33b5f9041aa3bbff08b1d55f87f01467f18f97ef521d64
7
+ data.tar.gz: afca96d6dd7357fe08899d793e40ea6473e7bd9707318f93848cee8cd95c98d3ff57f6f9f8543a8b08174ceddf34d39aab83135fc242d89441d351c453bf7758
data/.gitignore CHANGED
@@ -16,3 +16,5 @@ test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
18
  hackaday.*
19
+
20
+ .DS_Store
data/.rubocop.yml ADDED
@@ -0,0 +1,5 @@
1
+ # Avoid methods longer than 20 lines of code
2
+ MethodLength:
3
+ Enabled: true
4
+ CountComments: false # count full line comments?
5
+ Max: 20
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.0.0
data/Gemfile CHANGED
@@ -6,4 +6,12 @@ gem 'parallel'
6
6
  gem 'axlsx'
7
7
  gem 'terminal-table'
8
8
  gem 'pageme'
9
- gem "json"
9
+ gem 'json'
10
+ gem 'log4r'
11
+
12
+ group :test do
13
+ gem 'rake'
14
+ gem 'rubocop'
15
+ gem 'simplecov', :require => false
16
+ gem 'simple_mock'
17
+ end
data/README.md CHANGED
@@ -1,21 +1,27 @@
1
1
  # DocParser
2
2
 
3
- Docs: http://rubydoc.info/github/jurriaan/docparser/
3
+ [![Build Status](https://travis-ci.org/jurriaan/docparser.png?branch=master)](https://travis-ci.org/jurriaan/docparser)
4
4
 
5
5
  DocParser is a web scraping/screen scraping tool.
6
+
6
7
  You can use it to easily scrape web sites.
7
8
 
9
+ The gem is called [docparser](http://rubygems.org/gems/docparser).
10
+ You can find the documentation [here](http://rubydoc.info/github/jurriaan/docparser/).
11
+
8
12
  ## Features
9
13
 
10
14
  - XPath and CSS support through Nokogiri
11
15
  - Support for loading of URLs through open-uri
12
16
  - Support for parallel processing of the documents
13
- - 5 Output formats:
17
+ - 6 Output formats:
14
18
  * CSV
15
19
  * XLSX
16
20
  * HTML
17
21
  * YAML
22
+ * JSON
18
23
  * Screen (for debugging and development)
24
+ * And more! (easy to extend)
19
25
 
20
26
  ## Installation
21
27
 
@@ -33,11 +39,12 @@ Or install it yourself as:
33
39
 
34
40
  ## Usage
35
41
 
36
- See example.rb
42
+ See [example.rb](https://github.com/jurriaan/docparser/blob/master/example.rb)
37
43
 
38
44
  ## Todo
39
45
 
40
46
  - Tests
47
+ - Better examples
41
48
 
42
49
  ## Contributing
43
50
 
@@ -49,4 +56,4 @@ See example.rb
49
56
 
50
57
  ## Contributors
51
58
 
52
- - Jurriaan Pruis
59
+ - [Jurriaan Pruis](https://github.com/jurriaan)
data/Rakefile CHANGED
@@ -1 +1,16 @@
1
1
  require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << 'lib/docparser'
6
+ t.test_files = FileList['test/lib/**/*_test.rb']
7
+ t.verbose = true
8
+ end
9
+
10
+ task test: :rubocop
11
+
12
+ task :rubocop do
13
+ sh 'rubocop'
14
+ end
15
+
16
+ task :default => :test
data/example.rb CHANGED
@@ -1,21 +1,23 @@
1
1
  #
2
-
3
- # An example of parsing a popular dutch website..
2
+ # An example of parsing hackaday.com
4
3
  # (C) 2013 Jurriaan Pruis
5
4
  #
5
+ $LOAD_PATH.unshift __dir__
6
+ require File.expand_path('lib/docparser.rb', __dir__)
7
+ require 'tmpdir'
6
8
 
7
- require 'docparser'
8
9
  include DocParser
9
- output = HTMLOutput.new filename: 'hackaday.html'
10
+ output = MultiOutput.new(filename: 'hackaday')
10
11
  output.header = 'Title', 'Author', 'Publication date', 'URL', 'Summary'
11
- parser = Parser.new(files: (1..20).map {|i| "http://hackaday.com/page/#{i}/"}, parallel: false, output: output)
12
+ files = Dir[File.join(__dir__, 'test/support/hackaday/*.html')]
13
+ parser = Parser.new(files: files, parallel: false, output: output)
12
14
  parser.parse! do
13
15
  css('#content .post') do |post|
14
16
  title_el = post.search('.entry-title a').first
15
17
  title = title_el.content
16
- author =post.search('.post-info .author .fn a').first.content
18
+ author = post.search('.post-info .author .fn a').first.content
17
19
  published_time = post.search('.post-info .date.published').first.content
18
- url = title_el.attributes['href']
20
+ url = title_el.attributes['href'].value
19
21
  summary = post.search('.entry-content').first.content.strip
20
22
  add_row title, author, published_time, url, summary
21
23
  end
data/lib/docparser.rb CHANGED
@@ -1 +1,2 @@
1
+ $LOAD_PATH.unshift __dir__
1
2
  require 'docparser/parser'
@@ -5,26 +5,29 @@ module DocParser
5
5
  # @see Output
6
6
  class Document
7
7
  attr_reader :filename, :doc, :encoding, :results
8
- def initialize(filename, encoding: 'utf-8', parser: nil)
8
+ def initialize(filename: nil, encoding: 'utf-8', parser: nil)
9
9
  if encoding == 'utf-8'
10
10
  encodingstring = 'r:utf-8'
11
11
  else
12
12
  encodingstring = "r:#{encoding}:utf-8"
13
13
  end
14
-
14
+ @logger = Log4r::Logger.new('docparser::document')
15
+ @logger.debug { "Parsing #{filename}" }
15
16
  open(filename, encodingstring) do |f|
16
- @doc = Nokogiri::HTML(f)
17
+ @html = f.read
18
+ @logger.warn "#{filename} is empty" if @html.empty?
19
+ @doc = Nokogiri(@html)
17
20
  end
18
-
19
21
  @encoding = encoding
20
22
  @parser = parser
21
23
  @filename = filename
22
- @results = Array.new(@parser.outputs.length) { [] }
24
+ @results = Array.new(@parser.outputs ? @parser.outputs.length : 0) { [] }
23
25
  end
24
26
 
25
27
  # Adds a row to an output
26
28
  def add_row(*row, output: 0)
27
29
  output = @parser.outputs.index(output) if output.is_a? Output
30
+ @logger.debug { "#{filename}: Adding row #{row.flatten.to_s}" }
28
31
  results[output] << row.flatten
29
32
  end
30
33
 
@@ -36,13 +39,17 @@ module DocParser
36
39
 
37
40
  # @return [String] the source of the document
38
41
  def html
39
- @html ||= @doc.inner_html #TODO: ??
42
+ @html
40
43
  end
41
44
 
42
45
  # Executes a xpath query
43
46
  def xpath(query)
44
47
  res = @doc.search(query)
45
- res.each { |el| yield el } if block_given?
48
+ if block_given?
49
+ res.each { |el| yield el }
50
+ else
51
+ res
52
+ end
46
53
  end
47
54
 
48
55
  # Executes a xpath query and returns the content
@@ -58,7 +65,7 @@ module DocParser
58
65
 
59
66
  # Matches the HTML source using a regular expression
60
67
  def regexp(regexp)
61
- html.match(regexp) rescue nil
68
+ html.match(regexp)
62
69
  end
63
70
 
64
71
  # Parses the document
@@ -70,10 +77,10 @@ module DocParser
70
77
 
71
78
  # @!visibility private
72
79
  def inspect
73
- "<Document file:'#{@filename}'>"
80
+ "<Document file:'#{@filename}', encoding:'#{@encoding}'>"
74
81
  end
75
82
 
76
- alias :css :xpath
77
- alias :css_content :xpath_content
83
+ alias_method :css, :xpath
84
+ alias_method :css_content, :xpath_content
78
85
  end
79
86
  end
@@ -11,6 +11,8 @@ module DocParser
11
11
  @filename = filename
12
12
  raise ArgumentError, 'Please specify a filename' if filename.empty?
13
13
  @file = open filename, 'w'
14
+ classname = self.class.name.split('::').last
15
+ @logger = Log4r::Logger.new("docparser::output::#{classname}")
14
16
  open_file
15
17
  end
16
18
 
@@ -30,6 +32,9 @@ module DocParser
30
32
  def close
31
33
  footer
32
34
  @file.close unless @file.closed?
35
+ @logger.info "Finished writing"
36
+ size = File.size(@filename) / 1024.0
37
+ @logger.info sprintf("%s: %d rows, %.2f KiB", @filename, rowcount, size)
33
38
  end
34
39
 
35
40
  # Called after the file is opened
@@ -44,19 +49,14 @@ module DocParser
44
49
 
45
50
  # Called when a row is added
46
51
  def write_row(row)
47
- raise 'No row writer defined'
52
+ raise NotImplementedError.new('No row writer defined')
48
53
  end
49
54
 
50
55
  # Called before closing the file
51
56
  def footer
52
57
  end
58
+ end
53
59
 
54
- # Displays information about the output
55
- # @return [String] containing number of rows and file size
56
- def summary
57
- "%s:\t%d rows, %9.2f KiB" % [@filename,
58
- @rowcount,
59
- File.size(@filename) / 1024.0]
60
- end
60
+ class MissingHeaderException < StandardError
61
61
  end
62
62
  end
@@ -5,67 +5,73 @@ module DocParser
5
5
  class HTMLOutput < Output
6
6
  # @!visibility private
7
7
  HTMLHEADER = <<-EOS
8
- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
9
- "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
10
- <html>
11
- <head>
12
- <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
13
- <title>HTML output "#FILENAME#"</title>
14
- <style type="text/css">
15
- body {
16
- font-family:"Helvetica Neue", Helvetica, Sans-Serif;
17
- font-size:12px;
18
- }
19
- table {
20
- border:1px solid #69c;
21
- border-collapse:collapse;
8
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
9
+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
10
+ <html>
11
+ <head>
12
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
13
+ <title>HTML output "#FILENAME#"</title>
14
+ <style type="text/css">
15
+ body {
16
+ font-family:"Helvetica Neue", Helvetica, Sans-Serif;
22
17
  font-size:12px;
23
- text-align:left;
24
- width:480px;
25
- }
26
- th {
27
- border-bottom:1px dashed #69c;
28
- color:#039;
29
- font-size:14px;
30
- font-weight:normal;
31
- padding:12px 17px;
32
- }
33
- td {
34
- color:#669;
35
- padding:7px 17px;
36
- white-space: pre;
37
- }
38
- tbody tr:hover td {
39
- background:#d0dafd;
40
- color:#339;
41
- }
42
- tbody tr:nth-child(even) {
43
- background:#e0eaff;
44
- }
45
- </style>
46
- </head>
47
- <body>
48
- <table>
49
- EOS
18
+ }
19
+ table {
20
+ border:1px solid #69c;
21
+ border-collapse:collapse;
22
+ font-size:12px;
23
+ text-align:left;
24
+ width:480px;
25
+ }
26
+ th {
27
+ border-bottom:1px dashed #69c;
28
+ color:#039;
29
+ font-size:14px;
30
+ font-weight:normal;
31
+ padding:12px 17px;
32
+ }
33
+ td {
34
+ color:#669;
35
+ padding:7px 17px;
36
+ white-space: pre;
37
+ }
38
+ tbody tr:hover td {
39
+ background:#d0dafd;
40
+ color:#339;
41
+ }
42
+ tbody tr:nth-child(even) {
43
+ background:#e0eaff;
44
+ }
45
+ </style>
46
+ </head>
47
+ <body>
48
+ <table>
49
+ EOS
50
50
  # @!visibility private
51
51
  HTMLFOOTER = <<-EOS
52
- </tbody>
53
- </table>
54
- <p>#COUNT# rows</p>
55
- </body>
56
- </html>
57
- EOS
52
+ </tbody>
53
+ </table>
54
+ <p>#COUNT# rows</p>
55
+ </body>
56
+ </html>
57
+ EOS
58
58
  def open_file
59
59
  @file << HTMLHEADER.gsub('#FILENAME#', @filename)
60
60
  end
61
61
 
62
62
  def header
63
+ return if @header.nil? || @header.empty?
63
64
  @file << '<thead><tr>'
64
65
  @file << @header.map { |f| '<th>' + f + '</th>' }.join
65
66
  @file << "</tr></thead>\n<tbody>\n"
67
+ @tbody = true
66
68
  end
67
69
 
68
70
  def write_row(row)
71
+ unless @tbody
72
+ @file << "<tbody>\n"
73
+ @tbody = true
74
+ end
69
75
  @file << '<tr>'
70
76
  @file << row.map { |f| '<td>' + CGI.escapeHTML(f.to_s) + '</td>' }.join
71
77
  @file << "</tr>\n"
@@ -1,7 +1,7 @@
1
1
  require 'json'
2
2
  module DocParser
3
3
  # The JSONOutput class generates a JSON file containing all rows as separate
4
- # JSON documents
4
+ # Array elements
5
5
  # @see Output
6
6
  class JSONOutput < Output
7
7
  # @!visibility private
@@ -12,18 +12,23 @@ module DocParser
12
12
  end
13
13
 
14
14
  def write_row(row)
15
+ raise MissingHeaderException if @header.nil? || @header.length == 0
15
16
  if @first
16
17
  @first = false
17
18
  else
18
19
  @file << ','
19
20
  end
20
21
  0.upto(@header.length - 1) do |counter|
21
- @doc[@header[counter]] = row[counter] rescue ''
22
+ if row.length > counter
23
+ @doc[@header[counter]] = row[counter]
24
+ else
25
+ @doc[@header[counter]] = ''
26
+ end
22
27
  end
23
28
  @file << JSON.dump(@doc)
24
29
  end
25
30
 
26
- def close
31
+ def footer
27
32
  @file << ']'
28
33
  end
29
34
  end
@@ -24,27 +24,23 @@ module DocParser
24
24
  @outputs << HTMLOutput.new(htmloptions)
25
25
  @outputs << YAMLOutput.new(yamloptions)
26
26
  @outputs << XLSXOutput.new(xlsxoptions)
27
- @outputs << XLSXOutput.new(jsonoptions)
27
+ @outputs << JSONOutput.new(jsonoptions)
28
28
  end
29
29
 
30
30
  def header=(row)
31
- @outputs.each { |out| out.header = row.flatten }
31
+ @outputs.each { |out| out.header = row }
32
32
  end
33
33
 
34
34
  def add_row(row)
35
- @outputs.each { |out| out.add_row row.flatten }
35
+ @outputs.each { |out| out.add_row row }
36
36
  end
37
37
 
38
38
  def rowcount
39
- @outputs.min { |out| out.rowcount }.rowcount
39
+ @outputs.map { |out| out.rowcount }.min
40
40
  end
41
41
 
42
42
  def close
43
43
  @outputs.each { |out| out.close }
44
44
  end
45
-
46
- def summary
47
- @outputs.map { |out| out.summary }.join("\n")
48
- end
49
45
  end
50
46
  end