extractula 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/extractula/extractor.rb +23 -8
- data/lib/extractula.rb +1 -1
- metadata +1 -1
data/lib/extractula/extractor.rb
CHANGED
@@ -12,9 +12,13 @@ class Extractula::Extractor
   end
 
   def self.can_extract? url, html
-
+    if @extractable_domain.is_a? Regexp
+      url.host + url.path =~ @extractable_domain
+    else
+      @extractable_domain ? @extractable_domain == url.domain : false
+    end
   end
-
+
   def self.media_type type = nil
     @media_type = type if type
     @media_type
@@ -22,10 +26,11 @@ class Extractula::Extractor
 
   %w{title content summary image_urls video_embed }.each do |field|
     class_eval <<-EOS
-      def self.#{field}_path(path = nil, attrib = nil)
+      def self.#{field}_path(path = nil, attrib = nil, &block)
         if path
           @#{field}_path = path
           @#{field}_attr = attrib || :text
+          @#{field}_block = block
         end
         @#{field}_path
       end
@@ -34,6 +39,11 @@ class Extractula::Extractor
         @#{field}_attr = attrib if attrib
         @#{field}_attr
       end
+
+      def self.#{field}_block(&block)
+        @#{field}_block = block if block
+        @#{field}_block
+      end
 
       def #{field}_path
         self.class.#{field}_path
@@ -42,6 +52,10 @@ class Extractula::Extractor
       def #{field}_attr
         self.class.#{field}_attr
       end
+
+      def #{field}_block
+        self.class.#{field}_block
+      end
     EOS
   end
 
@@ -69,15 +83,15 @@ class Extractula::Extractor
   end
 
   def title
-    content_at(title_path, title_attr) || content_at("//title")
+    content_at(title_path, title_attr, title_block) || content_at("//title")
   end
 
   def content
-    content_at(content_path, content_attr) || extract_content
+    content_at(content_path, content_attr, content_block) || extract_content
   end
 
   def summary
-    content_at(summary_path, summary_attr)
+    content_at(summary_path, summary_attr, summary_block)
   end
 
   def image_urls
@@ -98,10 +112,11 @@ class Extractula::Extractor
 
   private
 
-  def content_at(path, attrib = :text)
+  def content_at(path, attrib = :text, block = nil)
     if path
       if node = html.at(path)
-        attrib == :text ? node.text.strip : node[attrib].strip
+        value = attrib == :text ? node.text.strip : node[attrib].strip
+        block ? block.call(value) : value
       end
     end
   end
data/lib/extractula.rb
CHANGED