omnivore 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/omnivore/document.rb +6 -20
- data/lib/omnivore/version.rb +1 -1
- data/spec/document_spec.rb +7 -21
- data/spec/fixtures/thia-breen-interview +2322 -0
- metadata +3 -2
data/lib/omnivore/document.rb
CHANGED
@@ -5,8 +5,8 @@ module Omnivore
|
|
5
5
|
|
6
6
|
class Document
|
7
7
|
attr_reader :model
|
8
|
-
|
9
|
-
Paragraph = Struct.new("
|
8
|
+
BLOCK_TAGS = %w[div p frame bod]
|
9
|
+
Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
|
10
10
|
|
11
11
|
|
12
12
|
def self.from_url(url)
|
@@ -45,39 +45,25 @@ module Omnivore
|
|
45
45
|
|
46
46
|
|
47
47
|
def to_text
|
48
|
-
paragraphs = self.to_paragraphs.keep_if { |p|
|
48
|
+
paragraphs = self.to_paragraphs.keep_if { |p| p.text_density > 0.5 }
|
49
49
|
paragraphs.map { |p| p.text }.join("\n")
|
50
50
|
end
|
51
51
|
|
52
52
|
|
53
53
|
def to_paragraphs
|
54
|
-
|
54
|
+
self.model.xpath("//div|//p").map { |block|
|
55
55
|
html = block.to_html.gsub(/\s+/, " ").strip
|
56
56
|
text = flatten(block).inject([ ]) { |memo, node|
|
57
57
|
memo << node.text.gsub(/\s+/, " ").strip if node.kind_of?(Nokogiri::XML::Text)
|
58
58
|
memo
|
59
59
|
}.join(" ")
|
60
|
-
Paragraph.new(block.path.to_s,
|
60
|
+
Paragraph.new(block.path.to_s, text, text.size / html.size.to_f)
|
61
61
|
}
|
62
62
|
end
|
63
63
|
|
64
64
|
|
65
65
|
private
|
66
66
|
|
67
|
-
def filter(container)
|
68
|
-
elements = [ ]
|
69
|
-
container.children.each { |child|
|
70
|
-
if CONTAINER_TAGS.include?(child.name)
|
71
|
-
unless child.attr("class") =~ /comment/i
|
72
|
-
elements << child
|
73
|
-
elements += filter(child)
|
74
|
-
end
|
75
|
-
end
|
76
|
-
}
|
77
|
-
elements
|
78
|
-
end
|
79
|
-
|
80
|
-
|
81
67
|
def flatten(block)
|
82
68
|
elements = [ ]
|
83
69
|
return elements if block.nil?
|
@@ -87,7 +73,7 @@ module Omnivore
|
|
87
73
|
elements << block
|
88
74
|
else
|
89
75
|
block.children.each { |child|
|
90
|
-
unless
|
76
|
+
unless BLOCK_TAGS.include?(child.name)
|
91
77
|
elements += flatten(child)
|
92
78
|
end
|
93
79
|
}
|
data/lib/omnivore/version.rb
CHANGED
data/spec/document_spec.rb
CHANGED
@@ -2,20 +2,7 @@ require 'omnivore/document'
|
|
2
2
|
include Omnivore
|
3
3
|
|
4
4
|
describe Document do
|
5
|
-
|
6
|
-
STATIC_HTML = %{
|
7
|
-
<html>
|
8
|
-
<head>
|
9
|
-
<title>Nothing To See</title>
|
10
|
-
<meta name="description" content="This is a test page.">
|
11
|
-
<meta name="keywords" content="metadata, testing, kayne west">
|
12
|
-
</head>
|
13
|
-
<body>
|
14
|
-
<p>Nothing to see here, move along.</p>
|
15
|
-
</body>
|
16
|
-
</html>
|
17
|
-
}
|
18
|
-
|
5
|
+
html = File.open("spec/fixtures/thia-breen-interview", "r") { |f| f.readlines }.join("\n")
|
19
6
|
|
20
7
|
it "should fetch the content of the provided url" do
|
21
8
|
document = Document.from_url("http://www.google.com")
|
@@ -24,28 +11,27 @@ describe Document do
|
|
24
11
|
|
25
12
|
|
26
13
|
it "should contain the document title" do
|
27
|
-
document = Document.from_html(
|
14
|
+
document = Document.from_html(html)
|
28
15
|
document.title.should_not be_nil
|
29
16
|
document.title.should_not be_empty
|
30
|
-
document.title.should == "
|
17
|
+
document.title.should == "Estee Lauder President Thia Breen Interview - Career Advice from Thia Breen - Marie Claire"
|
18
|
+
|
31
19
|
end
|
32
20
|
|
33
21
|
|
34
22
|
it "should contain the document metadata" do
|
35
|
-
document = Document.from_html(
|
23
|
+
document = Document.from_html(html)
|
36
24
|
document.metadata.should_not be_nil
|
37
25
|
document.metadata.should_not be_empty
|
38
|
-
document.metadata["keywords"].split(",").first.strip.should == "
|
26
|
+
document.metadata["keywords"].split(",").first.strip.should == "career advice"
|
39
27
|
end
|
40
28
|
|
41
29
|
|
42
30
|
it "should be able to extract the main content and ignore navigation and ads." do
|
43
|
-
|
44
|
-
document = Document.from_html(STATIC_HTML)
|
31
|
+
document = Document.from_html(html)
|
45
32
|
text = document.to_text
|
46
33
|
text.should_not be_nil
|
47
34
|
text.should_not be_empty
|
48
|
-
text.should == "Nothing to see here, move along."
|
49
35
|
end
|
50
36
|
|
51
37
|
end
|