extractula 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -130,7 +130,11 @@ class Extractula::Extractor
130
130
  end
131
131
 
132
132
  def extract_content
133
- candidate_nodes = html.search("//div|//p|//br").collect do |node|
133
+ fragment = content_node ? content_node.inner_html.strip : ""
134
+ end
135
+
136
+ def candidate_nodes
137
+ @candidate_nodes ||= html.search("//div|//p|//br").collect do |node|
134
138
  parent = node.parent
135
139
  if node.node_name == 'div'
136
140
  text_size = calculate_children_text_size(parent, "div")
@@ -170,9 +174,23 @@ class Extractula::Extractor
170
174
  else
171
175
  nil
172
176
  end
173
- end.compact.uniq
177
+ end.compact.uniq
178
+ end
174
179
 
175
- fragment = candidate_nodes.detect {|n| n[:text_size] > 140}[:parent].inner_html.strip rescue ""
180
+ def content_node_selector
181
+ Proc.new { |n| n[:text_size] > content_node_text_size_cutoff }
182
+ end
183
+
184
+ def content_node_text_size_cutoff
185
+ 140
186
+ end
187
+
188
+ def content_node
189
+ @content_node ||= begin
190
+ if node = candidate_nodes.detect(&content_node_selector)
191
+ node[:parent]
192
+ end
193
+ end
176
194
  end
177
195
 
178
196
  def calculate_children_text_size(parent, node_type)
@@ -86,13 +86,17 @@ module Extractula
86
86
  end
87
87
 
88
88
  def oembed_request
89
- request = "#{oembed_endpoint}?url=#{url.url}"
89
+ request = "#{oembed_endpoint}?url=#{oembed_request_url}"
90
90
  request += "&format=json" if oembed_format_param_required?
91
91
  request += "&maxwidth=#{oembed_max_width}" if oembed_max_width
92
92
  request += "&maxheight=#{oembed_max_height}" if oembed_max_height
93
93
  request
94
94
  end
95
95
 
96
+ def oembed_request_url
97
+ url.url
98
+ end
99
+
96
100
  def title
97
101
  oembed.title
98
102
  end
data/lib/extractula.rb CHANGED
@@ -9,7 +9,7 @@ require 'extractula/extracted_content'
9
9
  require 'extractula/extractor'
10
10
 
11
11
  module Extractula
12
- VERSION = "0.0.9"
12
+ VERSION = "0.0.10"
13
13
 
14
14
  @extractors = []
15
15
 
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extractula
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 10
9
+ version: 0.0.10
5
10
  platform: ruby
6
11
  authors:
7
12
  - Paul Dix
@@ -15,24 +20,32 @@ default_executable:
15
20
  dependencies:
16
21
  - !ruby/object:Gem::Dependency
17
22
  name: nokogiri
18
- type: :runtime
19
- version_requirement:
20
- version_requirements: !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
21
25
  requirements:
22
26
  - - ">"
23
27
  - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ - 0
31
+ - 0
24
32
  version: 0.0.0
25
- version:
33
+ type: :runtime
34
+ version_requirements: *id001
26
35
  - !ruby/object:Gem::Dependency
27
36
  name: loofah
28
- type: :runtime
29
- version_requirement:
30
- version_requirements: !ruby/object:Gem::Requirement
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
31
39
  requirements:
32
40
  - - ">="
33
41
  - !ruby/object:Gem::Version
42
+ segments:
43
+ - 0
44
+ - 4
45
+ - 2
34
46
  version: 0.4.2
35
- version:
47
+ type: :runtime
48
+ version_requirements: *id002
36
49
  description:
37
50
  email: paul@pauldix.net
38
51
  executables: []
@@ -79,18 +92,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
79
92
  requirements:
80
93
  - - ">="
81
94
  - !ruby/object:Gem::Version
95
+ segments:
96
+ - 0
82
97
  version: "0"
83
- version:
84
98
  required_rubygems_version: !ruby/object:Gem::Requirement
85
99
  requirements:
86
100
  - - ">="
87
101
  - !ruby/object:Gem::Version
102
+ segments:
103
+ - 0
88
104
  version: "0"
89
- version:
90
105
  requirements: []
91
106
 
92
107
  rubyforge_project:
93
- rubygems_version: 1.3.5
108
+ rubygems_version: 1.3.6
94
109
  signing_key:
95
110
  specification_version: 2
96
111
  summary: "Extracts content like title, summary, and images from web pages like Dracula extracts blood: with care and finesse."