extractula 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/extractula/custom_extractors/dinosaur_comics.rb +1 -0
- data/lib/extractula/custom_extractors/flickr.rb +2 -1
- data/lib/extractula/custom_extractors/you_tube.rb +1 -0
- data/lib/extractula/extracted_content.rb +1 -1
- data/lib/extractula/extractor.rb +15 -1
- data/lib/extractula.rb +1 -1
- metadata +1 -1
data/lib/extractula/custom_extractors/flickr.rb
CHANGED
@@ -2,7 +2,8 @@ module Extractula
|
|
2
2
|
class Flickr < Extractula::Extractor
|
3
3
|
include Extractula::OEmbed
|
4
4
|
domain 'flickr'
|
5
|
-
|
5
|
+
media_type 'image'
|
6
|
+
content_path 'meta[name=description]', 'content'
|
6
7
|
oembed_endpoint 'http://www.flickr.com/services/oembed/'
|
7
8
|
end
|
8
9
|
end
|
data/lib/extractula/extracted_content.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
class Extractula::ExtractedContent
|
2
|
-
attr_reader :url, :title, :content, :summary, :image_urls, :video_embed
|
2
|
+
attr_reader :url, :media_type, :title, :content, :summary, :image_urls, :video_embed
|
3
3
|
|
4
4
|
def initialize(attributes = {})
|
5
5
|
attributes.each_pair {|k, v| instance_variable_set("@#{k}", v)}
|
data/lib/extractula/extractor.rb
CHANGED
@@ -14,6 +14,11 @@ class Extractula::Extractor
|
|
14
14
|
def self.can_extract? url, html
|
15
15
|
@extractable_domain ? @extractable_domain == url.domain : false
|
16
16
|
end
|
17
|
+
|
18
|
+
def self.media_type type = nil
|
19
|
+
@media_type = type if type
|
20
|
+
@media_type
|
21
|
+
end
|
17
22
|
|
18
23
|
%w{title content summary image_urls video_embed }.each do |field|
|
19
24
|
class_eval <<-EOS
|
@@ -50,6 +55,7 @@ class Extractula::Extractor
|
|
50
55
|
def extract
|
51
56
|
Extractula::ExtractedContent.new({
|
52
57
|
:url => url.url,
|
58
|
+
:media_type => media_type,
|
53
59
|
:title => title,
|
54
60
|
:content => content,
|
55
61
|
:summary => summary,
|
@@ -58,6 +64,10 @@ class Extractula::Extractor
|
|
58
64
|
})
|
59
65
|
end
|
60
66
|
|
67
|
+
def media_type
|
68
|
+
self.class.media_type || 'text'
|
69
|
+
end
|
70
|
+
|
61
71
|
def title
|
62
72
|
content_at(title_path, title_attr) || content_at("//title")
|
63
73
|
end
|
@@ -72,7 +82,11 @@ class Extractula::Extractor
|
|
72
82
|
|
73
83
|
def image_urls
|
74
84
|
if image_urls_path
|
75
|
-
html.search(image_urls_path).collect
|
85
|
+
html.search(image_urls_path).collect do |img|
|
86
|
+
src = img['src'].strip
|
87
|
+
src = "#{@url.scheme}://#{@url.host}#{src}" if src.start_with?('/')
|
88
|
+
src
|
89
|
+
end
|
76
90
|
end
|
77
91
|
end
|
78
92
|
|
data/lib/extractula.rb
CHANGED