omnivore 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,8 +5,8 @@ module Omnivore
5
5
 
6
6
  class Document
7
7
  attr_reader :model
8
- CONTAINER_TAGS = %w[div p]
9
- Paragraph = Struct.new("Block", :path, :html, :text)
8
+ BLOCK_TAGS = %w[div p frame bod]
9
+ Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
10
10
 
11
11
 
12
12
  def self.from_url(url)
@@ -45,39 +45,25 @@ module Omnivore
45
45
 
46
46
 
47
47
  def to_text
48
- paragraphs = self.to_paragraphs.keep_if { |p| (p.text.size / p.html.size.to_f) > 0.1 }
48
+ paragraphs = self.to_paragraphs.keep_if { |p| p.text_density > 0.5 }
49
49
  paragraphs.map { |p| p.text }.join("\n")
50
50
  end
51
51
 
52
52
 
53
53
  def to_paragraphs
54
- filter(self.model.xpath("/html/body")).map { |block|
54
+ self.model.xpath("//div|//p").map { |block|
55
55
  html = block.to_html.gsub(/\s+/, " ").strip
56
56
  text = flatten(block).inject([ ]) { |memo, node|
57
57
  memo << node.text.gsub(/\s+/, " ").strip if node.kind_of?(Nokogiri::XML::Text)
58
58
  memo
59
59
  }.join(" ")
60
- Paragraph.new(block.path.to_s, html, text)
60
+ Paragraph.new(block.path.to_s, text, text.size / html.size.to_f)
61
61
  }
62
62
  end
63
63
 
64
64
 
65
65
  private
66
66
 
67
- def filter(container)
68
- elements = [ ]
69
- container.children.each { |child|
70
- if CONTAINER_TAGS.include?(child.name)
71
- unless child.attr("class") =~ /comment/i
72
- elements << child
73
- elements += filter(child)
74
- end
75
- end
76
- }
77
- elements
78
- end
79
-
80
-
81
67
  def flatten(block)
82
68
  elements = [ ]
83
69
  return elements if block.nil?
@@ -87,7 +73,7 @@ module Omnivore
87
73
  elements << block
88
74
  else
89
75
  block.children.each { |child|
90
- unless %w[div p].include?(child.name)
76
+ unless BLOCK_TAGS.include?(child.name)
91
77
  elements += flatten(child)
92
78
  end
93
79
  }
@@ -1,3 +1,3 @@
1
1
  module Omnivore
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -2,20 +2,7 @@ require 'omnivore/document'
2
2
  include Omnivore
3
3
 
4
4
  describe Document do
5
-
6
- STATIC_HTML = %{
7
- <html>
8
- <head>
9
- <title>Nothing To See</title>
10
- <meta name="description" content="This is a test page.">
11
- <meta name="keywords" content="metadata, testing, kayne west">
12
- </head>
13
- <body>
14
- <p>Nothing to see here, move along.</p>
15
- </body>
16
- </html>
17
- }
18
-
5
+ html = File.open("spec/fixtures/thia-breen-interview", "r") { |f| f.readlines }.join("\n")
19
6
 
20
7
  it "should fetch the content of the provided url" do
21
8
  document = Document.from_url("http://www.google.com")
@@ -24,28 +11,27 @@ describe Document do
24
11
 
25
12
 
26
13
  it "should contain the document title" do
27
- document = Document.from_html(STATIC_HTML)
14
+ document = Document.from_html(html)
28
15
  document.title.should_not be_nil
29
16
  document.title.should_not be_empty
30
- document.title.should == "Nothing To See"
17
+ document.title.should == "Estee Lauder President Thia Breen Interview - Career Advice from Thia Breen - Marie Claire"
18
+
31
19
  end
32
20
 
33
21
 
34
22
  it "should contain the document metadata" do
35
- document = Document.from_html(STATIC_HTML)
23
+ document = Document.from_html(html)
36
24
  document.metadata.should_not be_nil
37
25
  document.metadata.should_not be_empty
38
- document.metadata["keywords"].split(",").first.strip.should == "metadata"
26
+ document.metadata["keywords"].split(",").first.strip.should == "career advice"
39
27
  end
40
28
 
41
29
 
42
30
  it "should be able to extract the main content and ignore navigation and ads." do
43
- #document = Document.from_url("http://www.marieclaire.com/career-money/jobs/thia-breen-interview")
44
- document = Document.from_html(STATIC_HTML)
31
+ document = Document.from_html(html)
45
32
  text = document.to_text
46
33
  text.should_not be_nil
47
34
  text.should_not be_empty
48
- text.should == "Nothing to see here, move along."
49
35
  end
50
36
 
51
37
  end