extractula 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/extractula/extractor.rb +21 -3
- data/lib/extractula/oembed.rb +5 -1
- data/lib/extractula.rb +1 -1
- metadata +27 -12
data/lib/extractula/extractor.rb
CHANGED
@@ -130,7 +130,11 @@ class Extractula::Extractor
|
|
130
130
|
end
|
131
131
|
|
132
132
|
def extract_content
|
133
|
-
|
133
|
+
fragment = content_node ? content_node.inner_html.strip : ""
|
134
|
+
end
|
135
|
+
|
136
|
+
def candidate_nodes
|
137
|
+
@candidate_nodes ||= html.search("//div|//p|//br").collect do |node|
|
134
138
|
parent = node.parent
|
135
139
|
if node.node_name == 'div'
|
136
140
|
text_size = calculate_children_text_size(parent, "div")
|
@@ -170,9 +174,23 @@ class Extractula::Extractor
|
|
170
174
|
else
|
171
175
|
nil
|
172
176
|
end
|
173
|
-
end.compact.uniq
|
177
|
+
end.compact.uniq
|
178
|
+
end
|
174
179
|
|
175
|
-
|
180
|
+
def content_node_selector
|
181
|
+
Proc.new { |n| n[:text_size] > content_node_text_size_cutoff }
|
182
|
+
end
|
183
|
+
|
184
|
+
def content_node_text_size_cutoff
|
185
|
+
140
|
186
|
+
end
|
187
|
+
|
188
|
+
def content_node
|
189
|
+
@content_node ||= begin
|
190
|
+
if node = candidate_nodes.detect(&content_node_selector)
|
191
|
+
node[:parent]
|
192
|
+
end
|
193
|
+
end
|
176
194
|
end
|
177
195
|
|
178
196
|
def calculate_children_text_size(parent, node_type)
|
data/lib/extractula/oembed.rb
CHANGED
@@ -86,13 +86,17 @@ module Extractula
|
|
86
86
|
end
|
87
87
|
|
88
88
|
def oembed_request
|
89
|
-
request = "#{oembed_endpoint}?url=#{url.url}"
|
89
|
+
request = "#{oembed_endpoint}?url=#{oembed_request_url}"
|
90
90
|
request += "&format=json" if oembed_format_param_required?
|
91
91
|
request += "&maxwidth=#{oembed_max_width}" if oembed_max_width
|
92
92
|
request += "&maxheight=#{oembed_max_height}" if oembed_max_height
|
93
93
|
request
|
94
94
|
end
|
95
95
|
|
96
|
+
def oembed_request_url
|
97
|
+
url.url
|
98
|
+
end
|
99
|
+
|
96
100
|
def title
|
97
101
|
oembed.title
|
98
102
|
end
|
data/lib/extractula.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: extractula
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 10
|
9
|
+
version: 0.0.10
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- Paul Dix
|
@@ -15,24 +20,32 @@ default_executable:
|
|
15
20
|
dependencies:
|
16
21
|
- !ruby/object:Gem::Dependency
|
17
22
|
name: nokogiri
|
18
|
-
|
19
|
-
|
20
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
21
25
|
requirements:
|
22
26
|
- - ">"
|
23
27
|
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
- 0
|
31
|
+
- 0
|
24
32
|
version: 0.0.0
|
25
|
-
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
26
35
|
- !ruby/object:Gem::Dependency
|
27
36
|
name: loofah
|
28
|
-
|
29
|
-
|
30
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
39
|
requirements:
|
32
40
|
- - ">="
|
33
41
|
- !ruby/object:Gem::Version
|
42
|
+
segments:
|
43
|
+
- 0
|
44
|
+
- 4
|
45
|
+
- 2
|
34
46
|
version: 0.4.2
|
35
|
-
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id002
|
36
49
|
description:
|
37
50
|
email: paul@pauldix.net
|
38
51
|
executables: []
|
@@ -79,18 +92,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
79
92
|
requirements:
|
80
93
|
- - ">="
|
81
94
|
- !ruby/object:Gem::Version
|
95
|
+
segments:
|
96
|
+
- 0
|
82
97
|
version: "0"
|
83
|
-
version:
|
84
98
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
99
|
requirements:
|
86
100
|
- - ">="
|
87
101
|
- !ruby/object:Gem::Version
|
102
|
+
segments:
|
103
|
+
- 0
|
88
104
|
version: "0"
|
89
|
-
version:
|
90
105
|
requirements: []
|
91
106
|
|
92
107
|
rubyforge_project:
|
93
|
-
rubygems_version: 1.3.
|
108
|
+
rubygems_version: 1.3.6
|
94
109
|
signing_key:
|
95
110
|
specification_version: 2
|
96
111
|
summary: "Extracts content like title, summary, and images from web pages like Dracula extracts blood: with care and finesse."
|