omnivore 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,33 +1,100 @@
1
+ require "nokogiri"
2
+ require "omnivore/http_client"
3
+
1
4
  module Omnivore
2
- require "omnivore/http_client"
3
- require "omnivore/html_helper"
4
5
 
5
6
  class Document
6
- attr_reader :html
7
+ attr_reader :model
8
+ CONTAINER_TAGS = %w[div p]
9
+ Paragraph = Struct.new("Block", :path, :html, :text)
10
+
7
11
 
8
12
  def self.from_url(url)
9
13
  Document.new(HttpClient.get(url))
10
14
  end
11
15
 
16
+
12
17
  def self.from_html(html)
13
18
  Document.new(html)
14
19
  end
15
20
 
16
21
 
17
22
  def initialize(html)
18
- @html = html
23
+ @model = Nokogiri::HTML.parse(html) { |config|
24
+ config.options = Nokogiri::XML::ParseOptions::NOBLANKS
25
+ }
26
+ end
27
+
28
+
29
+ def to_html
30
+ self.model.to_html
19
31
  end
20
32
 
21
33
 
22
34
  def title
23
- unless @title
24
- matches = HtmlHelper.xpath(self.html, "/html/head/title")
25
- @title = HtmlHelper.to_text(matches.first) || ""
26
- end
27
- @title
35
+ @title ||= self.model.xpath("/html/head/title").text.gsub(/\s+/, " ").strip
36
+ end
37
+
38
+
39
+ def metadata
40
+ @metadata ||= self.model.xpath("//meta").inject({ }) { |memo, el|
41
+ memo[el.attr("name")] = el.attr("content") || "" if el.attr("name")
42
+ memo
43
+ }
28
44
  end
29
45
 
30
46
 
47
+ def to_text
48
+ paragraphs = self.to_paragraphs.keep_if { |p| (p.text.size / p.html.size.to_f) > 0.1 }
49
+ paragraphs.map { |p| p.text }.join("\n")
50
+ end
51
+
52
+
53
+ def to_paragraphs
54
+ filter(self.model.xpath("/html/body")).map { |block|
55
+ html = block.to_html.gsub(/\s+/, " ").strip
56
+ text = flatten(block).inject([ ]) { |memo, node|
57
+ memo << node.text.gsub(/\s+/, " ").strip if node.kind_of?(Nokogiri::XML::Text)
58
+ memo
59
+ }.join(" ")
60
+ Paragraph.new(block.path.to_s, html, text)
61
+ }
62
+ end
63
+
64
+
65
+ private
66
+
67
+ def filter(container)
68
+ elements = [ ]
69
+ container.children.each { |child|
70
+ if CONTAINER_TAGS.include?(child.name)
71
+ unless child.attr("class") =~ /comment/i
72
+ elements << child
73
+ elements += filter(child)
74
+ end
75
+ end
76
+ }
77
+ elements
78
+ end
79
+
80
+
81
+ def flatten(block)
82
+ elements = [ ]
83
+ return elements if block.nil?
84
+ return elements if block.respond_to?('cdata?') and block.cdata?
85
+ return elements if block.respond_to?('comment?') and block.comment?
86
+ if block.children.empty?
87
+ elements << block
88
+ else
89
+ block.children.each { |child|
90
+ unless %w[div p].include?(child.name)
91
+ elements += flatten(child)
92
+ end
93
+ }
94
+ end
95
+ elements
96
+ end
97
+
31
98
  end
32
99
 
33
100
  end
@@ -1,3 +1,3 @@
1
1
  module Omnivore
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
@@ -7,6 +7,8 @@ describe Document do
7
7
  <html>
8
8
  <head>
9
9
  <title>Nothing To See</title>
10
+ <meta name="description" content="This is a test page.">
11
+ <meta name="keywords" content="metadata, testing, kayne west">
10
12
  </head>
11
13
  <body>
12
14
  <p>Nothing to see here, move along.</p>
@@ -14,17 +16,36 @@ describe Document do
14
16
  </html>
15
17
  }
16
18
 
19
+
17
20
  it "should fetch the content of the provided url" do
18
21
  document = Document.from_url("http://www.google.com")
19
- document.html.should_not be_nil
20
- document.html.should_not be_empty
22
+ document.to_html.should_not be_empty
21
23
  end
22
24
 
23
- it "should provide the document title" do
25
+
26
+ it "should contain the document title" do
24
27
  document = Document.from_html(STATIC_HTML)
25
28
  document.title.should_not be_nil
26
29
  document.title.should_not be_empty
27
30
  document.title.should == "Nothing To See"
28
31
  end
29
32
 
33
+
34
+ it "should contain the document metadata" do
35
+ document = Document.from_html(STATIC_HTML)
36
+ document.metadata.should_not be_nil
37
+ document.metadata.should_not be_empty
38
+ document.metadata["keywords"].split(",").first.strip.should == "metadata"
39
+ end
40
+
41
+
42
+ it "should be able to extract the main content and ignore navigation and ads." do
43
+ #document = Document.from_url("http://www.marieclaire.com/career-money/jobs/thia-breen-interview")
44
+ document = Document.from_html(STATIC_HTML)
45
+ text = document.to_text
46
+ text.should_not be_nil
47
+ text.should_not be_empty
48
+ text.should == "Nothing to see here, move along."
49
+ end
50
+
30
51
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: omnivore
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.2
5
+ version: 0.0.3
6
6
  platform: ruby
7
7
  authors:
8
8
  - Matthias Eder
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2012-01-05 00:00:00 -07:00
13
+ date: 2012-01-10 00:00:00 -07:00
14
14
  default_executable:
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
@@ -52,12 +52,10 @@ files:
52
52
  - Rakefile
53
53
  - lib/omnivore.rb
54
54
  - lib/omnivore/document.rb
55
- - lib/omnivore/html_helper.rb
56
55
  - lib/omnivore/http_client.rb
57
56
  - lib/omnivore/version.rb
58
57
  - omnivore.gemspec
59
58
  - spec/document_spec.rb
60
- - spec/html_helper_spec.rb
61
59
  - spec/http_client_spec.rb
62
60
  has_rdoc: true
63
61
  homepage: ""
@@ -1,52 +0,0 @@
1
- require "nokogiri"
2
-
3
- module Omnivore
4
- module HtmlHelper
5
-
6
- class HtmlTransformer
7
-
8
- def initialize(html)
9
- @html = html
10
- end
11
-
12
- def to_text
13
- document = Nokogiri::HTML.parse(@html)
14
- partition(document, 'style', 'script').values.join(' ').strip.gsub(/\s+/, ' ')
15
- end
16
-
17
- def partition(node, *ignore_tags)
18
- elements = { }
19
- return elements if node.nil?
20
- return elements if node.respond_to?('cdata?') and node.cdata?
21
- return elements if node.respond_to?('comment?') and node.comment?
22
-
23
- if node.kind_of?(Nokogiri::XML::Element) and ignore_tags and ignore_tags.size > 0
24
- return elements if node.name =~ %r[#{ignore_tags.join('|')}]i
25
- end
26
-
27
- elements = { }
28
- if node.kind_of?(Nokogiri::XML::Text)
29
- elements[node.path.to_s] = node.text
30
- return elements
31
- end
32
- node.children.each do |child|
33
- elements.merge!(partition(child, *ignore_tags))
34
- end
35
- elements
36
- end
37
-
38
- end
39
-
40
- def HtmlHelper.to_text(html)
41
- transformer = HtmlTransformer.new(html)
42
- transformer.to_text
43
- end
44
-
45
-
46
- def HtmlHelper.xpath(html, xpath)
47
- document = Nokogiri::HTML.parse(html)
48
- document.xpath(xpath).map { |m| m.to_html }
49
- end
50
-
51
- end
52
- end
@@ -1,42 +0,0 @@
1
- require "omnivore/html_helper"
2
- include Omnivore
3
-
4
-
5
-
6
- describe HtmlHelper do
7
-
8
- it "should match the correct xpath" do
9
- content = %{
10
- <html>
11
- <head></head>
12
- <body>
13
- <div class="banner">
14
- I don't want to see this.
15
- </div>
16
- <div class="content">
17
- This is what I want to see.
18
- </div>
19
- </body>
20
- </html>
21
- }
22
- matches = HtmlHelper.xpath(content, "//div[@class=\"content\"]")
23
- matches.size.should == 1
24
- end
25
-
26
-
27
- it "should be able to extract text from markup" do
28
- html = %{
29
- <p>
30
- Content may contain some additional markup, such as:
31
- <ul>
32
- <li>Ordered or unordered lists,</li>
33
- <li><a href="#">Hyperlinks,</a></li>
34
- <li>and Images
35
- </ul>
36
- </p>
37
- }
38
- text = HtmlHelper.to_text(html)
39
- text.should == "Content may contain some additional markup, such as: Ordered or unordered lists, Hyperlinks, and Images"
40
- end
41
-
42
- end