extractula 0.0.9 → 0.0.10

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -130,7 +130,11 @@ class Extractula::Extractor
130
130
  end
131
131
 
132
132
  def extract_content
133
- candidate_nodes = html.search("//div|//p|//br").collect do |node|
133
+ fragment = content_node ? content_node.inner_html.strip : ""
134
+ end
135
+
136
+ def candidate_nodes
137
+ @candidate_nodes ||= html.search("//div|//p|//br").collect do |node|
134
138
  parent = node.parent
135
139
  if node.node_name == 'div'
136
140
  text_size = calculate_children_text_size(parent, "div")
@@ -170,9 +174,23 @@ class Extractula::Extractor
170
174
  else
171
175
  nil
172
176
  end
173
- end.compact.uniq
177
+ end.compact.uniq
178
+ end
174
179
 
175
- fragment = candidate_nodes.detect {|n| n[:text_size] > 140}[:parent].inner_html.strip rescue ""
180
+ def content_node_selector
181
+ Proc.new { |n| n[:text_size] > content_node_text_size_cutoff }
182
+ end
183
+
184
+ def content_node_text_size_cutoff
185
+ 140
186
+ end
187
+
188
+ def content_node
189
+ @content_node ||= begin
190
+ if node = candidate_nodes.detect(&content_node_selector)
191
+ node[:parent]
192
+ end
193
+ end
176
194
  end
177
195
 
178
196
  def calculate_children_text_size(parent, node_type)
@@ -86,13 +86,17 @@ module Extractula
86
86
  end
87
87
 
88
88
  def oembed_request
89
- request = "#{oembed_endpoint}?url=#{url.url}"
89
+ request = "#{oembed_endpoint}?url=#{oembed_request_url}"
90
90
  request += "&format=json" if oembed_format_param_required?
91
91
  request += "&maxwidth=#{oembed_max_width}" if oembed_max_width
92
92
  request += "&maxheight=#{oembed_max_height}" if oembed_max_height
93
93
  request
94
94
  end
95
95
 
96
+ def oembed_request_url
97
+ url.url
98
+ end
99
+
96
100
  def title
97
101
  oembed.title
98
102
  end
data/lib/extractula.rb CHANGED
@@ -9,7 +9,7 @@ require 'extractula/extracted_content'
9
9
  require 'extractula/extractor'
10
10
 
11
11
  module Extractula
12
- VERSION = "0.0.9"
12
+ VERSION = "0.0.10"
13
13
 
14
14
  @extractors = []
15
15
 
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extractula
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 10
9
+ version: 0.0.10
5
10
  platform: ruby
6
11
  authors:
7
12
  - Paul Dix
@@ -15,24 +20,32 @@ default_executable:
15
20
  dependencies:
16
21
  - !ruby/object:Gem::Dependency
17
22
  name: nokogiri
18
- type: :runtime
19
- version_requirement:
20
- version_requirements: !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
21
25
  requirements:
22
26
  - - ">"
23
27
  - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ - 0
31
+ - 0
24
32
  version: 0.0.0
25
- version:
33
+ type: :runtime
34
+ version_requirements: *id001
26
35
  - !ruby/object:Gem::Dependency
27
36
  name: loofah
28
- type: :runtime
29
- version_requirement:
30
- version_requirements: !ruby/object:Gem::Requirement
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
31
39
  requirements:
32
40
  - - ">="
33
41
  - !ruby/object:Gem::Version
42
+ segments:
43
+ - 0
44
+ - 4
45
+ - 2
34
46
  version: 0.4.2
35
- version:
47
+ type: :runtime
48
+ version_requirements: *id002
36
49
  description:
37
50
  email: paul@pauldix.net
38
51
  executables: []
@@ -79,18 +92,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
79
92
  requirements:
80
93
  - - ">="
81
94
  - !ruby/object:Gem::Version
95
+ segments:
96
+ - 0
82
97
  version: "0"
83
- version:
84
98
  required_rubygems_version: !ruby/object:Gem::Requirement
85
99
  requirements:
86
100
  - - ">="
87
101
  - !ruby/object:Gem::Version
102
+ segments:
103
+ - 0
88
104
  version: "0"
89
- version:
90
105
  requirements: []
91
106
 
92
107
  rubyforge_project:
93
- rubygems_version: 1.3.5
108
+ rubygems_version: 1.3.6
94
109
  signing_key:
95
110
  specification_version: 2
96
111
  summary: "Extracts content like title, summary, and images from web pages like Dracula extracts blood: with care and finesse."