omnivore 0.0.3 → 0.0.4

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
@@ -5,8 +5,8 @@ module Omnivore
5
5
 
6
6
  class Document
7
7
  attr_reader :model
8
- CONTAINER_TAGS = %w[div p]
9
- Paragraph = Struct.new("Block", :path, :html, :text)
8
+ BLOCK_TAGS = %w[div p frame bod]
9
+ Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
10
10
 
11
11
 
12
12
  def self.from_url(url)
@@ -45,39 +45,25 @@ module Omnivore
45
45
 
46
46
 
47
47
  def to_text
48
- paragraphs = self.to_paragraphs.keep_if { |p| (p.text.size / p.html.size.to_f) > 0.1 }
48
+ paragraphs = self.to_paragraphs.keep_if { |p| p.text_density > 0.5 }
49
49
  paragraphs.map { |p| p.text }.join("\n")
50
50
  end
51
51
 
52
52
 
53
53
  def to_paragraphs
54
- filter(self.model.xpath("/html/body")).map { |block|
54
+ self.model.xpath("//div|//p").map { |block|
55
55
  html = block.to_html.gsub(/\s+/, " ").strip
56
56
  text = flatten(block).inject([ ]) { |memo, node|
57
57
  memo << node.text.gsub(/\s+/, " ").strip if node.kind_of?(Nokogiri::XML::Text)
58
58
  memo
59
59
  }.join(" ")
60
- Paragraph.new(block.path.to_s, html, text)
60
+ Paragraph.new(block.path.to_s, text, text.size / html.size.to_f)
61
61
  }
62
62
  end
63
63
 
64
64
 
65
65
  private
66
66
 
67
- def filter(container)
68
- elements = [ ]
69
- container.children.each { |child|
70
- if CONTAINER_TAGS.include?(child.name)
71
- unless child.attr("class") =~ /comment/i
72
- elements << child
73
- elements += filter(child)
74
- end
75
- end
76
- }
77
- elements
78
- end
79
-
80
-
81
67
  def flatten(block)
82
68
  elements = [ ]
83
69
  return elements if block.nil?
@@ -87,7 +73,7 @@ module Omnivore
87
73
  elements << block
88
74
  else
89
75
  block.children.each { |child|
90
- unless %w[div p].include?(child.name)
76
+ unless BLOCK_TAGS.include?(child.name)
91
77
  elements += flatten(child)
92
78
  end
93
79
  }
@@ -1,3 +1,3 @@
1
1
  module Omnivore
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -2,20 +2,7 @@ require 'omnivore/document'
2
2
  include Omnivore
3
3
 
4
4
  describe Document do
5
-
6
- STATIC_HTML = %{
7
- <html>
8
- <head>
9
- <title>Nothing To See</title>
10
- <meta name="description" content="This is a test page.">
11
- <meta name="keywords" content="metadata, testing, kayne west">
12
- </head>
13
- <body>
14
- <p>Nothing to see here, move along.</p>
15
- </body>
16
- </html>
17
- }
18
-
5
+ html = File.open("spec/fixtures/thia-breen-interview", "r") { |f| f.readlines }.join("\n")
19
6
 
20
7
  it "should fetch the content of the provided url" do
21
8
  document = Document.from_url("http://www.google.com")
@@ -24,28 +11,27 @@ describe Document do
24
11
 
25
12
 
26
13
  it "should contain the document title" do
27
- document = Document.from_html(STATIC_HTML)
14
+ document = Document.from_html(html)
28
15
  document.title.should_not be_nil
29
16
  document.title.should_not be_empty
30
- document.title.should == "Nothing To See"
17
+ document.title.should == "Estee Lauder President Thia Breen Interview - Career Advice from Thia Breen - Marie Claire"
18
+
31
19
  end
32
20
 
33
21
 
34
22
  it "should contain the document metadata" do
35
- document = Document.from_html(STATIC_HTML)
23
+ document = Document.from_html(html)
36
24
  document.metadata.should_not be_nil
37
25
  document.metadata.should_not be_empty
38
- document.metadata["keywords"].split(",").first.strip.should == "metadata"
26
+ document.metadata["keywords"].split(",").first.strip.should == "career advice"
39
27
  end
40
28
 
41
29
 
42
30
  it "should be able to extract the main content and ignore navigation and ads." do
43
- #document = Document.from_url("http://www.marieclaire.com/career-money/jobs/thia-breen-interview")
44
- document = Document.from_html(STATIC_HTML)
31
+ document = Document.from_html(html)
45
32
  text = document.to_text
46
33
  text.should_not be_nil
47
34
  text.should_not be_empty
48
- text.should == "Nothing to see here, move along."
49
35
  end
50
36
 
51
37
  end