omnivore 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/omnivore/document.rb +6 -20
- data/lib/omnivore/version.rb +1 -1
- data/spec/document_spec.rb +7 -21
- data/spec/fixtures/thia-breen-interview +2322 -0
- metadata +3 -2
data/lib/omnivore/document.rb
CHANGED
@@ -5,8 +5,8 @@ module Omnivore
|
|
5
5
|
|
6
6
|
class Document
|
7
7
|
attr_reader :model
|
8
|
-
|
9
|
-
Paragraph = Struct.new("
|
8
|
+
BLOCK_TAGS = %w[div p frame bod]
|
9
|
+
Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
|
10
10
|
|
11
11
|
|
12
12
|
def self.from_url(url)
|
@@ -45,39 +45,25 @@ module Omnivore
|
|
45
45
|
|
46
46
|
|
47
47
|
def to_text
|
48
|
-
paragraphs = self.to_paragraphs.keep_if { |p|
|
48
|
+
paragraphs = self.to_paragraphs.keep_if { |p| p.text_density > 0.5 }
|
49
49
|
paragraphs.map { |p| p.text }.join("\n")
|
50
50
|
end
|
51
51
|
|
52
52
|
|
53
53
|
def to_paragraphs
|
54
|
-
|
54
|
+
self.model.xpath("//div|//p").map { |block|
|
55
55
|
html = block.to_html.gsub(/\s+/, " ").strip
|
56
56
|
text = flatten(block).inject([ ]) { |memo, node|
|
57
57
|
memo << node.text.gsub(/\s+/, " ").strip if node.kind_of?(Nokogiri::XML::Text)
|
58
58
|
memo
|
59
59
|
}.join(" ")
|
60
|
-
Paragraph.new(block.path.to_s,
|
60
|
+
Paragraph.new(block.path.to_s, text, text.size / html.size.to_f)
|
61
61
|
}
|
62
62
|
end
|
63
63
|
|
64
64
|
|
65
65
|
private
|
66
66
|
|
67
|
-
def filter(container)
|
68
|
-
elements = [ ]
|
69
|
-
container.children.each { |child|
|
70
|
-
if CONTAINER_TAGS.include?(child.name)
|
71
|
-
unless child.attr("class") =~ /comment/i
|
72
|
-
elements << child
|
73
|
-
elements += filter(child)
|
74
|
-
end
|
75
|
-
end
|
76
|
-
}
|
77
|
-
elements
|
78
|
-
end
|
79
|
-
|
80
|
-
|
81
67
|
def flatten(block)
|
82
68
|
elements = [ ]
|
83
69
|
return elements if block.nil?
|
@@ -87,7 +73,7 @@ module Omnivore
|
|
87
73
|
elements << block
|
88
74
|
else
|
89
75
|
block.children.each { |child|
|
90
|
-
unless
|
76
|
+
unless BLOCK_TAGS.include?(child.name)
|
91
77
|
elements += flatten(child)
|
92
78
|
end
|
93
79
|
}
|
data/lib/omnivore/version.rb
CHANGED
data/spec/document_spec.rb
CHANGED
@@ -2,20 +2,7 @@ require 'omnivore/document'
|
|
2
2
|
include Omnivore
|
3
3
|
|
4
4
|
describe Document do
|
5
|
-
|
6
|
-
STATIC_HTML = %{
|
7
|
-
<html>
|
8
|
-
<head>
|
9
|
-
<title>Nothing To See</title>
|
10
|
-
<meta name="description" content="This is a test page.">
|
11
|
-
<meta name="keywords" content="metadata, testing, kayne west">
|
12
|
-
</head>
|
13
|
-
<body>
|
14
|
-
<p>Nothing to see here, move along.</p>
|
15
|
-
</body>
|
16
|
-
</html>
|
17
|
-
}
|
18
|
-
|
5
|
+
html = File.open("spec/fixtures/thia-breen-interview", "r") { |f| f.readlines }.join("\n")
|
19
6
|
|
20
7
|
it "should fetch the content of the provided url" do
|
21
8
|
document = Document.from_url("http://www.google.com")
|
@@ -24,28 +11,27 @@ describe Document do
|
|
24
11
|
|
25
12
|
|
26
13
|
it "should contain the document title" do
|
27
|
-
document = Document.from_html(
|
14
|
+
document = Document.from_html(html)
|
28
15
|
document.title.should_not be_nil
|
29
16
|
document.title.should_not be_empty
|
30
|
-
document.title.should == "
|
17
|
+
document.title.should == "Estee Lauder President Thia Breen Interview - Career Advice from Thia Breen - Marie Claire"
|
18
|
+
|
31
19
|
end
|
32
20
|
|
33
21
|
|
34
22
|
it "should contain the document metadata" do
|
35
|
-
document = Document.from_html(
|
23
|
+
document = Document.from_html(html)
|
36
24
|
document.metadata.should_not be_nil
|
37
25
|
document.metadata.should_not be_empty
|
38
|
-
document.metadata["keywords"].split(",").first.strip.should == "
|
26
|
+
document.metadata["keywords"].split(",").first.strip.should == "career advice"
|
39
27
|
end
|
40
28
|
|
41
29
|
|
42
30
|
it "should be able to extract the main content and ignore navigation and ads." do
|
43
|
-
|
44
|
-
document = Document.from_html(STATIC_HTML)
|
31
|
+
document = Document.from_html(html)
|
45
32
|
text = document.to_text
|
46
33
|
text.should_not be_nil
|
47
34
|
text.should_not be_empty
|
48
|
-
text.should == "Nothing to see here, move along."
|
49
35
|
end
|
50
36
|
|
51
37
|
end
|