extractula 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,6 +3,7 @@
3
3
  module Extractula
4
4
  class DinosaurComics < Extractula::Extractor
5
5
  domain 'qwantz'
6
+ media_type 'image'
6
7
  content_path 'img.comic', 'title'
7
8
  image_urls_path 'img.comic'
8
9
  end
@@ -2,7 +2,8 @@ module Extractula
2
2
  class Flickr < Extractula::Extractor
3
3
  include Extractula::OEmbed
4
4
  domain 'flickr'
5
- content_path 'div.photoDescription'
5
+ media_type 'image'
6
+ content_path 'meta[name=description]', 'content'
6
7
  oembed_endpoint 'http://www.flickr.com/services/oembed/'
7
8
  end
8
9
  end
@@ -2,6 +2,7 @@ module Extractula
2
2
  class YouTube < Extractula::Extractor
3
3
  include Extractula::OEmbed
4
4
  domain 'youtube'
5
+ media_type 'video'
5
6
  content_path '.description'
6
7
  oembed_endpoint 'http://www.youtube.com/oembed'
7
8
  end
@@ -1,5 +1,5 @@
1
1
  class Extractula::ExtractedContent
2
- attr_reader :url, :title, :content, :summary, :image_urls, :video_embed
2
+ attr_reader :url, :media_type, :title, :content, :summary, :image_urls, :video_embed
3
3
 
4
4
  def initialize(attributes = {})
5
5
  attributes.each_pair {|k, v| instance_variable_set("@#{k}", v)}
@@ -14,6 +14,11 @@ class Extractula::Extractor
14
14
  def self.can_extract? url, html
15
15
  @extractable_domain ? @extractable_domain == url.domain : false
16
16
  end
17
+
18
+ def self.media_type type = nil
19
+ @media_type = type if type
20
+ @media_type
21
+ end
17
22
 
18
23
  %w{title content summary image_urls video_embed }.each do |field|
19
24
  class_eval <<-EOS
@@ -50,6 +55,7 @@ class Extractula::Extractor
50
55
  def extract
51
56
  Extractula::ExtractedContent.new({
52
57
  :url => url.url,
58
+ :media_type => media_type,
53
59
  :title => title,
54
60
  :content => content,
55
61
  :summary => summary,
@@ -58,6 +64,10 @@ class Extractula::Extractor
58
64
  })
59
65
  end
60
66
 
67
+ def media_type
68
+ self.class.media_type || 'text'
69
+ end
70
+
61
71
  def title
62
72
  content_at(title_path, title_attr) || content_at("//title")
63
73
  end
@@ -72,7 +82,11 @@ class Extractula::Extractor
72
82
 
73
83
  def image_urls
74
84
  if image_urls_path
75
- html.search(image_urls_path).collect { |img| img['src'].strip }
85
+ html.search(image_urls_path).collect do |img|
86
+ src = img['src'].strip
87
+ src = "#{@url.scheme}://#{@url.host}#{src}" if src.start_with?('/')
88
+ src
89
+ end
76
90
  end
77
91
  end
78
92
 
data/lib/extractula.rb CHANGED
@@ -8,7 +8,7 @@ require 'extractula/extracted_content'
8
8
  require 'extractula/extractor'
9
9
 
10
10
  module Extractula
11
- VERSION = "0.0.2"
11
+ VERSION = "0.0.3"
12
12
 
13
13
  @extractors = []
14
14
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extractula
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Dix