extractula 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/extractula/extractor.rb +21 -3
- data/lib/extractula/oembed.rb +5 -1
- data/lib/extractula.rb +1 -1
- metadata +27 -12
data/lib/extractula/extractor.rb
CHANGED
@@ -130,7 +130,11 @@ class Extractula::Extractor
|
|
130
130
|
end
|
131
131
|
|
132
132
|
def extract_content
|
133
|
-
|
133
|
+
fragment = content_node ? content_node.inner_html.strip : ""
|
134
|
+
end
|
135
|
+
|
136
|
+
def candidate_nodes
|
137
|
+
@candidate_nodes ||= html.search("//div|//p|//br").collect do |node|
|
134
138
|
parent = node.parent
|
135
139
|
if node.node_name == 'div'
|
136
140
|
text_size = calculate_children_text_size(parent, "div")
|
@@ -170,9 +174,23 @@ class Extractula::Extractor
|
|
170
174
|
else
|
171
175
|
nil
|
172
176
|
end
|
173
|
-
end.compact.uniq
|
177
|
+
end.compact.uniq
|
178
|
+
end
|
174
179
|
|
175
|
-
|
180
|
+
def content_node_selector
|
181
|
+
Proc.new { |n| n[:text_size] > content_node_text_size_cutoff }
|
182
|
+
end
|
183
|
+
|
184
|
+
def content_node_text_size_cutoff
|
185
|
+
140
|
186
|
+
end
|
187
|
+
|
188
|
+
def content_node
|
189
|
+
@content_node ||= begin
|
190
|
+
if node = candidate_nodes.detect(&content_node_selector)
|
191
|
+
node[:parent]
|
192
|
+
end
|
193
|
+
end
|
176
194
|
end
|
177
195
|
|
178
196
|
def calculate_children_text_size(parent, node_type)
|
data/lib/extractula/oembed.rb
CHANGED
@@ -86,13 +86,17 @@ module Extractula
|
|
86
86
|
end
|
87
87
|
|
88
88
|
def oembed_request
|
89
|
-
request = "#{oembed_endpoint}?url=#{url.url}"
|
89
|
+
request = "#{oembed_endpoint}?url=#{oembed_request_url}"
|
90
90
|
request += "&format=json" if oembed_format_param_required?
|
91
91
|
request += "&maxwidth=#{oembed_max_width}" if oembed_max_width
|
92
92
|
request += "&maxheight=#{oembed_max_height}" if oembed_max_height
|
93
93
|
request
|
94
94
|
end
|
95
95
|
|
96
|
+
def oembed_request_url
|
97
|
+
url.url
|
98
|
+
end
|
99
|
+
|
96
100
|
def title
|
97
101
|
oembed.title
|
98
102
|
end
|
data/lib/extractula.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: extractula
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 10
|
9
|
+
version: 0.0.10
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- Paul Dix
|
@@ -15,24 +20,32 @@ default_executable:
|
|
15
20
|
dependencies:
|
16
21
|
- !ruby/object:Gem::Dependency
|
17
22
|
name: nokogiri
|
18
|
-
|
19
|
-
|
20
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
21
25
|
requirements:
|
22
26
|
- - ">"
|
23
27
|
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
- 0
|
31
|
+
- 0
|
24
32
|
version: 0.0.0
|
25
|
-
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
26
35
|
- !ruby/object:Gem::Dependency
|
27
36
|
name: loofah
|
28
|
-
|
29
|
-
|
30
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
39
|
requirements:
|
32
40
|
- - ">="
|
33
41
|
- !ruby/object:Gem::Version
|
42
|
+
segments:
|
43
|
+
- 0
|
44
|
+
- 4
|
45
|
+
- 2
|
34
46
|
version: 0.4.2
|
35
|
-
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id002
|
36
49
|
description:
|
37
50
|
email: paul@pauldix.net
|
38
51
|
executables: []
|
@@ -79,18 +92,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
79
92
|
requirements:
|
80
93
|
- - ">="
|
81
94
|
- !ruby/object:Gem::Version
|
95
|
+
segments:
|
96
|
+
- 0
|
82
97
|
version: "0"
|
83
|
-
version:
|
84
98
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
99
|
requirements:
|
86
100
|
- - ">="
|
87
101
|
- !ruby/object:Gem::Version
|
102
|
+
segments:
|
103
|
+
- 0
|
88
104
|
version: "0"
|
89
|
-
version:
|
90
105
|
requirements: []
|
91
106
|
|
92
107
|
rubyforge_project:
|
93
|
-
rubygems_version: 1.3.
|
108
|
+
rubygems_version: 1.3.6
|
94
109
|
signing_key:
|
95
110
|
specification_version: 2
|
96
111
|
summary: "Extracts content like title, summary, and images from web pages like Dracula extracts blood: with care and finesse."
|