omnivore 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,33 +1,100 @@
1
+ require "nokogiri"
2
+ require "omnivore/http_client"
3
+
1
4
  module Omnivore
2
- require "omnivore/http_client"
3
- require "omnivore/html_helper"
4
5
 
5
6
  class Document
6
- attr_reader :html
7
+ attr_reader :model
8
+ CONTAINER_TAGS = %w[div p]
9
+ Paragraph = Struct.new("Block", :path, :html, :text)
10
+
7
11
 
8
12
  def self.from_url(url)
9
13
  Document.new(HttpClient.get(url))
10
14
  end
11
15
 
16
+
12
17
  def self.from_html(html)
13
18
  Document.new(html)
14
19
  end
15
20
 
16
21
 
17
22
  def initialize(html)
18
- @html = html
23
+ @model = Nokogiri::HTML.parse(html) { |config|
24
+ config.options = Nokogiri::XML::ParseOptions::NOBLANKS
25
+ }
26
+ end
27
+
28
+
29
+ def to_html
30
+ self.model.to_html
19
31
  end
20
32
 
21
33
 
22
34
  def title
23
- unless @title
24
- matches = HtmlHelper.xpath(self.html, "/html/head/title")
25
- @title = HtmlHelper.to_text(matches.first) || ""
26
- end
27
- @title
35
+ @title ||= self.model.xpath("/html/head/title").text.gsub(/\s+/, " ").strip
36
+ end
37
+
38
+
39
+ def metadata
40
+ @metadata ||= self.model.xpath("//meta").inject({ }) { |memo, el|
41
+ memo[el.attr("name")] = el.attr("content") || "" if el.attr("name")
42
+ memo
43
+ }
28
44
  end
29
45
 
30
46
 
47
+ def to_text
48
+ paragraphs = self.to_paragraphs.keep_if { |p| (p.text.size / p.html.size.to_f) > 0.1 }
49
+ paragraphs.map { |p| p.text }.join("\n")
50
+ end
51
+
52
+
53
+ def to_paragraphs
54
+ filter(self.model.xpath("/html/body")).map { |block|
55
+ html = block.to_html.gsub(/\s+/, " ").strip
56
+ text = flatten(block).inject([ ]) { |memo, node|
57
+ memo << node.text.gsub(/\s+/, " ").strip if node.kind_of?(Nokogiri::XML::Text)
58
+ memo
59
+ }.join(" ")
60
+ Paragraph.new(block.path.to_s, html, text)
61
+ }
62
+ end
63
+
64
+
65
+ private
66
+
67
+ def filter(container)
68
+ elements = [ ]
69
+ container.children.each { |child|
70
+ if CONTAINER_TAGS.include?(child.name)
71
+ unless child.attr("class") =~ /comment/i
72
+ elements << child
73
+ elements += filter(child)
74
+ end
75
+ end
76
+ }
77
+ elements
78
+ end
79
+
80
+
81
+ def flatten(block)
82
+ elements = [ ]
83
+ return elements if block.nil?
84
+ return elements if block.respond_to?('cdata?') and block.cdata?
85
+ return elements if block.respond_to?('comment?') and block.comment?
86
+ if block.children.empty?
87
+ elements << block
88
+ else
89
+ block.children.each { |child|
90
+ unless %w[div p].include?(child.name)
91
+ elements += flatten(child)
92
+ end
93
+ }
94
+ end
95
+ elements
96
+ end
97
+
31
98
  end
32
99
 
33
100
  end
@@ -1,3 +1,3 @@
1
1
  module Omnivore
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
@@ -7,6 +7,8 @@ describe Document do
7
7
  <html>
8
8
  <head>
9
9
  <title>Nothing To See</title>
10
+ <meta name="description" content="This is a test page.">
11
+ <meta name="keywords" content="metadata, testing, kayne west">
10
12
  </head>
11
13
  <body>
12
14
  <p>Nothing to see here, move along.</p>
@@ -14,17 +16,36 @@ describe Document do
14
16
  </html>
15
17
  }
16
18
 
19
+
17
20
  it "should fetch the content of the provided url" do
18
21
  document = Document.from_url("http://www.google.com")
19
- document.html.should_not be_nil
20
- document.html.should_not be_empty
22
+ document.to_html.should_not be_empty
21
23
  end
22
24
 
23
- it "should provide the document title" do
25
+
26
+ it "should contain the document title" do
24
27
  document = Document.from_html(STATIC_HTML)
25
28
  document.title.should_not be_nil
26
29
  document.title.should_not be_empty
27
30
  document.title.should == "Nothing To See"
28
31
  end
29
32
 
33
+
34
+ it "should contain the document metadata" do
35
+ document = Document.from_html(STATIC_HTML)
36
+ document.metadata.should_not be_nil
37
+ document.metadata.should_not be_empty
38
+ document.metadata["keywords"].split(",").first.strip.should == "metadata"
39
+ end
40
+
41
+
42
+ it "should be able to extract the main content and ignore navigation and ads." do
43
+ #document = Document.from_url("http://www.marieclaire.com/career-money/jobs/thia-breen-interview")
44
+ document = Document.from_html(STATIC_HTML)
45
+ text = document.to_text
46
+ text.should_not be_nil
47
+ text.should_not be_empty
48
+ text.should == "Nothing to see here, move along."
49
+ end
50
+
30
51
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: omnivore
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.2
5
+ version: 0.0.3
6
6
  platform: ruby
7
7
  authors:
8
8
  - Matthias Eder
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2012-01-05 00:00:00 -07:00
13
+ date: 2012-01-10 00:00:00 -07:00
14
14
  default_executable:
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
@@ -52,12 +52,10 @@ files:
52
52
  - Rakefile
53
53
  - lib/omnivore.rb
54
54
  - lib/omnivore/document.rb
55
- - lib/omnivore/html_helper.rb
56
55
  - lib/omnivore/http_client.rb
57
56
  - lib/omnivore/version.rb
58
57
  - omnivore.gemspec
59
58
  - spec/document_spec.rb
60
- - spec/html_helper_spec.rb
61
59
  - spec/http_client_spec.rb
62
60
  has_rdoc: true
63
61
  homepage: ""
@@ -1,52 +0,0 @@
1
- require "nokogiri"
2
-
3
- module Omnivore
4
- module HtmlHelper
5
-
6
- class HtmlTransformer
7
-
8
- def initialize(html)
9
- @html = html
10
- end
11
-
12
- def to_text
13
- document = Nokogiri::HTML.parse(@html)
14
- partition(document, 'style', 'script').values.join(' ').strip.gsub(/\s+/, ' ')
15
- end
16
-
17
- def partition(node, *ignore_tags)
18
- elements = { }
19
- return elements if node.nil?
20
- return elements if node.respond_to?('cdata?') and node.cdata?
21
- return elements if node.respond_to?('comment?') and node.comment?
22
-
23
- if node.kind_of?(Nokogiri::XML::Element) and ignore_tags and ignore_tags.size > 0
24
- return elements if node.name =~ %r[#{ignore_tags.join('|')}]i
25
- end
26
-
27
- elements = { }
28
- if node.kind_of?(Nokogiri::XML::Text)
29
- elements[node.path.to_s] = node.text
30
- return elements
31
- end
32
- node.children.each do |child|
33
- elements.merge!(partition(child, *ignore_tags))
34
- end
35
- elements
36
- end
37
-
38
- end
39
-
40
- def HtmlHelper.to_text(html)
41
- transformer = HtmlTransformer.new(html)
42
- transformer.to_text
43
- end
44
-
45
-
46
- def HtmlHelper.xpath(html, xpath)
47
- document = Nokogiri::HTML.parse(html)
48
- document.xpath(xpath).map { |m| m.to_html }
49
- end
50
-
51
- end
52
- end
@@ -1,42 +0,0 @@
1
- require "omnivore/html_helper"
2
- include Omnivore
3
-
4
-
5
-
6
- describe HtmlHelper do
7
-
8
- it "should match the correct xpath" do
9
- content = %{
10
- <html>
11
- <head></head>
12
- <body>
13
- <div class="banner">
14
- I don't want to see this.
15
- </div>
16
- <div class="content">
17
- This is what I want to see.
18
- </div>
19
- </body>
20
- </html>
21
- }
22
- matches = HtmlHelper.xpath(content, "//div[@class=\"content\"]")
23
- matches.size.should == 1
24
- end
25
-
26
-
27
- it "should be able to extract text from markup" do
28
- html = %{
29
- <p>
30
- Content may contain some additional markup, such as:
31
- <ul>
32
- <li>Ordered or unordered lists,</li>
33
- <li><a href="#">Hyperlinks,</a></li>
34
- <li>and Images
35
- </ul>
36
- </p>
37
- }
38
- text = HtmlHelper.to_text(html)
39
- text.should == "Content may contain some additional markup, such as: Ordered or unordered lists, Hyperlinks, and Images"
40
- end
41
-
42
- end