omnivore 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/omnivore/document.rb +76 -9
- data/lib/omnivore/version.rb +1 -1
- data/spec/document_spec.rb +24 -3
- metadata +2 -4
- data/lib/omnivore/html_helper.rb +0 -52
- data/spec/html_helper_spec.rb +0 -42
data/lib/omnivore/document.rb
CHANGED
@@ -1,33 +1,100 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "omnivore/http_client"
|
3
|
+
|
1
4
|
module Omnivore
|
2
|
-
require "omnivore/http_client"
|
3
|
-
require "omnivore/html_helper"
|
4
5
|
|
5
6
|
class Document
|
6
|
-
attr_reader :html
|
7
|
+
attr_reader :model
|
8
|
+
CONTAINER_TAGS = %w[div p]
|
9
|
+
Paragraph = Struct.new("Block", :path, :html, :text)
|
10
|
+
|
7
11
|
|
8
12
|
def self.from_url(url)
|
9
13
|
Document.new(HttpClient.get(url))
|
10
14
|
end
|
11
15
|
|
16
|
+
|
12
17
|
def self.from_html(html)
|
13
18
|
Document.new(html)
|
14
19
|
end
|
15
20
|
|
16
21
|
|
17
22
|
def initialize(html)
|
18
|
-
@
|
23
|
+
@model = Nokogiri::HTML.parse(html) { |config|
|
24
|
+
config.options = Nokogiri::XML::ParseOptions::NOBLANKS
|
25
|
+
}
|
26
|
+
end
|
27
|
+
|
28
|
+
|
29
|
+
def to_html
|
30
|
+
self.model.to_html
|
19
31
|
end
|
20
32
|
|
21
33
|
|
22
34
|
def title
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
35
|
+
@title ||= self.model.xpath("/html/head/title").text.gsub(/\s+/, " ").strip
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
def metadata
|
40
|
+
@metadata ||= self.model.xpath("//meta").inject({ }) { |memo, el|
|
41
|
+
memo[el.attr("name")] = el.attr("content") || "" if el.attr("name")
|
42
|
+
memo
|
43
|
+
}
|
28
44
|
end
|
29
45
|
|
30
46
|
|
47
|
+
def to_text
|
48
|
+
paragraphs = self.to_paragraphs.keep_if { |p| (p.text.size / p.html.size.to_f) > 0.1 }
|
49
|
+
paragraphs.map { |p| p.text }.join("\n")
|
50
|
+
end
|
51
|
+
|
52
|
+
|
53
|
+
def to_paragraphs
|
54
|
+
filter(self.model.xpath("/html/body")).map { |block|
|
55
|
+
html = block.to_html.gsub(/\s+/, " ").strip
|
56
|
+
text = flatten(block).inject([ ]) { |memo, node|
|
57
|
+
memo << node.text.gsub(/\s+/, " ").strip if node.kind_of?(Nokogiri::XML::Text)
|
58
|
+
memo
|
59
|
+
}.join(" ")
|
60
|
+
Paragraph.new(block.path.to_s, html, text)
|
61
|
+
}
|
62
|
+
end
|
63
|
+
|
64
|
+
|
65
|
+
private
|
66
|
+
|
67
|
+
def filter(container)
|
68
|
+
elements = [ ]
|
69
|
+
container.children.each { |child|
|
70
|
+
if CONTAINER_TAGS.include?(child.name)
|
71
|
+
unless child.attr("class") =~ /comment/i
|
72
|
+
elements << child
|
73
|
+
elements += filter(child)
|
74
|
+
end
|
75
|
+
end
|
76
|
+
}
|
77
|
+
elements
|
78
|
+
end
|
79
|
+
|
80
|
+
|
81
|
+
def flatten(block)
|
82
|
+
elements = [ ]
|
83
|
+
return elements if block.nil?
|
84
|
+
return elements if block.respond_to?('cdata?') and block.cdata?
|
85
|
+
return elements if block.respond_to?('comment?') and block.comment?
|
86
|
+
if block.children.empty?
|
87
|
+
elements << block
|
88
|
+
else
|
89
|
+
block.children.each { |child|
|
90
|
+
unless %w[div p].include?(child.name)
|
91
|
+
elements += flatten(child)
|
92
|
+
end
|
93
|
+
}
|
94
|
+
end
|
95
|
+
elements
|
96
|
+
end
|
97
|
+
|
31
98
|
end
|
32
99
|
|
33
100
|
end
|
data/lib/omnivore/version.rb
CHANGED
data/spec/document_spec.rb
CHANGED
@@ -7,6 +7,8 @@ describe Document do
|
|
7
7
|
<html>
|
8
8
|
<head>
|
9
9
|
<title>Nothing To See</title>
|
10
|
+
<meta name="description" content="This is a test page.">
|
11
|
+
<meta name="keywords" content="metadata, testing, kayne west">
|
10
12
|
</head>
|
11
13
|
<body>
|
12
14
|
<p>Nothing to see here, move along.</p>
|
@@ -14,17 +16,36 @@ describe Document do
|
|
14
16
|
</html>
|
15
17
|
}
|
16
18
|
|
19
|
+
|
17
20
|
it "should fetch the content of the provided url" do
|
18
21
|
document = Document.from_url("http://www.google.com")
|
19
|
-
document.
|
20
|
-
document.html.should_not be_empty
|
22
|
+
document.to_html.should_not be_empty
|
21
23
|
end
|
22
24
|
|
23
|
-
|
25
|
+
|
26
|
+
it "should contain the document title" do
|
24
27
|
document = Document.from_html(STATIC_HTML)
|
25
28
|
document.title.should_not be_nil
|
26
29
|
document.title.should_not be_empty
|
27
30
|
document.title.should == "Nothing To See"
|
28
31
|
end
|
29
32
|
|
33
|
+
|
34
|
+
it "should contain the document metadata" do
|
35
|
+
document = Document.from_html(STATIC_HTML)
|
36
|
+
document.metadata.should_not be_nil
|
37
|
+
document.metadata.should_not be_empty
|
38
|
+
document.metadata["keywords"].split(",").first.strip.should == "metadata"
|
39
|
+
end
|
40
|
+
|
41
|
+
|
42
|
+
it "should be able to extract the main content and ignore navigation and ads." do
|
43
|
+
#document = Document.from_url("http://www.marieclaire.com/career-money/jobs/thia-breen-interview")
|
44
|
+
document = Document.from_html(STATIC_HTML)
|
45
|
+
text = document.to_text
|
46
|
+
text.should_not be_nil
|
47
|
+
text.should_not be_empty
|
48
|
+
text.should == "Nothing to see here, move along."
|
49
|
+
end
|
50
|
+
|
30
51
|
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: omnivore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.2
|
5
|
+
version: 0.0.3
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Matthias Eder
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2012-01-
|
13
|
+
date: 2012-01-10 00:00:00 -07:00
|
14
14
|
default_executable:
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
@@ -52,12 +52,10 @@ files:
|
|
52
52
|
- Rakefile
|
53
53
|
- lib/omnivore.rb
|
54
54
|
- lib/omnivore/document.rb
|
55
|
-
- lib/omnivore/html_helper.rb
|
56
55
|
- lib/omnivore/http_client.rb
|
57
56
|
- lib/omnivore/version.rb
|
58
57
|
- omnivore.gemspec
|
59
58
|
- spec/document_spec.rb
|
60
|
-
- spec/html_helper_spec.rb
|
61
59
|
- spec/http_client_spec.rb
|
62
60
|
has_rdoc: true
|
63
61
|
homepage: ""
|
data/lib/omnivore/html_helper.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
require "nokogiri"
|
2
|
-
|
3
|
-
module Omnivore
|
4
|
-
module HtmlHelper
|
5
|
-
|
6
|
-
class HtmlTransformer
|
7
|
-
|
8
|
-
def initialize(html)
|
9
|
-
@html = html
|
10
|
-
end
|
11
|
-
|
12
|
-
def to_text
|
13
|
-
document = Nokogiri::HTML.parse(@html)
|
14
|
-
partition(document, 'style', 'script').values.join(' ').strip.gsub(/\s+/, ' ')
|
15
|
-
end
|
16
|
-
|
17
|
-
def partition(node, *ignore_tags)
|
18
|
-
elements = { }
|
19
|
-
return elements if node.nil?
|
20
|
-
return elements if node.respond_to?('cdata?') and node.cdata?
|
21
|
-
return elements if node.respond_to?('comment?') and node.comment?
|
22
|
-
|
23
|
-
if node.kind_of?(Nokogiri::XML::Element) and ignore_tags and ignore_tags.size > 0
|
24
|
-
return elements if node.name =~ %r[#{ignore_tags.join('|')}]i
|
25
|
-
end
|
26
|
-
|
27
|
-
elements = { }
|
28
|
-
if node.kind_of?(Nokogiri::XML::Text)
|
29
|
-
elements[node.path.to_s] = node.text
|
30
|
-
return elements
|
31
|
-
end
|
32
|
-
node.children.each do |child|
|
33
|
-
elements.merge!(partition(child, *ignore_tags))
|
34
|
-
end
|
35
|
-
elements
|
36
|
-
end
|
37
|
-
|
38
|
-
end
|
39
|
-
|
40
|
-
def HtmlHelper.to_text(html)
|
41
|
-
transformer = HtmlTransformer.new(html)
|
42
|
-
transformer.to_text
|
43
|
-
end
|
44
|
-
|
45
|
-
|
46
|
-
def HtmlHelper.xpath(html, xpath)
|
47
|
-
document = Nokogiri::HTML.parse(html)
|
48
|
-
document.xpath(xpath).map { |m| m.to_html }
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
end
|
data/spec/html_helper_spec.rb
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
require "omnivore/html_helper"
|
2
|
-
include Omnivore
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
describe HtmlHelper do
|
7
|
-
|
8
|
-
it "should match the correct xpath" do
|
9
|
-
content = %{
|
10
|
-
<html>
|
11
|
-
<head></head>
|
12
|
-
<body>
|
13
|
-
<div class="banner">
|
14
|
-
I don't want to see this.
|
15
|
-
</div>
|
16
|
-
<div class="content">
|
17
|
-
This is what I want to see.
|
18
|
-
</div>
|
19
|
-
</body>
|
20
|
-
</html>
|
21
|
-
}
|
22
|
-
matches = HtmlHelper.xpath(content, "//div[@class=\"content\"]")
|
23
|
-
matches.size.should == 1
|
24
|
-
end
|
25
|
-
|
26
|
-
|
27
|
-
it "should be able to extract text from markup" do
|
28
|
-
html = %{
|
29
|
-
<p>
|
30
|
-
Content may contain some additional markup, such as:
|
31
|
-
<ul>
|
32
|
-
<li>Ordered or unordered lists,</li>
|
33
|
-
<li><a href="#">Hyperlinks,</a></li>
|
34
|
-
<li>and Images
|
35
|
-
</ul>
|
36
|
-
</p>
|
37
|
-
}
|
38
|
-
text = HtmlHelper.to_text(html)
|
39
|
-
text.should == "Content may contain some additional markup, such as: Ordered or unordered lists, Hyperlinks, and Images"
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|