extractula 0.0.5 → 0.0.6
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- data/lib/extractula/extractor.rb +23 -8
- data/lib/extractula.rb +1 -1
- metadata +1 -1
data/lib/extractula/extractor.rb
CHANGED
@@ -12,9 +12,13 @@ class Extractula::Extractor
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def self.can_extract? url, html
|
15
|
-
|
15
|
+
if @extractable_domain.is_a? Regexp
|
16
|
+
url.host + url.path =~ @extractable_domain
|
17
|
+
else
|
18
|
+
@extractable_domain ? @extractable_domain == url.domain : false
|
19
|
+
end
|
16
20
|
end
|
17
|
-
|
21
|
+
|
18
22
|
def self.media_type type = nil
|
19
23
|
@media_type = type if type
|
20
24
|
@media_type
|
@@ -22,10 +26,11 @@ class Extractula::Extractor
|
|
22
26
|
|
23
27
|
%w{title content summary image_urls video_embed }.each do |field|
|
24
28
|
class_eval <<-EOS
|
25
|
-
def self.#{field}_path(path = nil, attrib = nil)
|
29
|
+
def self.#{field}_path(path = nil, attrib = nil, &block)
|
26
30
|
if path
|
27
31
|
@#{field}_path = path
|
28
32
|
@#{field}_attr = attrib || :text
|
33
|
+
@#{field}_block = block
|
29
34
|
end
|
30
35
|
@#{field}_path
|
31
36
|
end
|
@@ -34,6 +39,11 @@ class Extractula::Extractor
|
|
34
39
|
@#{field}_attr = attrib if attrib
|
35
40
|
@#{field}_attr
|
36
41
|
end
|
42
|
+
|
43
|
+
def self.#{field}_block(&block)
|
44
|
+
@#{field}_block = block if block
|
45
|
+
@#{field}_block
|
46
|
+
end
|
37
47
|
|
38
48
|
def #{field}_path
|
39
49
|
self.class.#{field}_path
|
@@ -42,6 +52,10 @@ class Extractula::Extractor
|
|
42
52
|
def #{field}_attr
|
43
53
|
self.class.#{field}_attr
|
44
54
|
end
|
55
|
+
|
56
|
+
def #{field}_block
|
57
|
+
self.class.#{field}_block
|
58
|
+
end
|
45
59
|
EOS
|
46
60
|
end
|
47
61
|
|
@@ -69,15 +83,15 @@ class Extractula::Extractor
|
|
69
83
|
end
|
70
84
|
|
71
85
|
def title
|
72
|
-
content_at(title_path, title_attr) || content_at("//title")
|
86
|
+
content_at(title_path, title_attr, title_block) || content_at("//title")
|
73
87
|
end
|
74
88
|
|
75
89
|
def content
|
76
|
-
content_at(content_path, content_attr) || extract_content
|
90
|
+
content_at(content_path, content_attr, content_block) || extract_content
|
77
91
|
end
|
78
92
|
|
79
93
|
def summary
|
80
|
-
content_at(summary_path, summary_attr)
|
94
|
+
content_at(summary_path, summary_attr, summary_block)
|
81
95
|
end
|
82
96
|
|
83
97
|
def image_urls
|
@@ -98,10 +112,11 @@ class Extractula::Extractor
|
|
98
112
|
|
99
113
|
private
|
100
114
|
|
101
|
-
def content_at(path, attrib = :text)
|
115
|
+
def content_at(path, attrib = :text, block = nil)
|
102
116
|
if path
|
103
117
|
if node = html.at(path)
|
104
|
-
attrib == :text ? node.text.strip : node[attrib].strip
|
118
|
+
value = attrib == :text ? node.text.strip : node[attrib].strip
|
119
|
+
block ? block.call(value) : value
|
105
120
|
end
|
106
121
|
end
|
107
122
|
end
|
data/lib/extractula.rb
CHANGED