extractula 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,7 @@
3
3
  module Extractula
4
4
  class DinosaurComics < Extractula::Extractor
5
5
  domain 'qwantz'
6
+ media_type 'image'
6
7
  content_path 'img.comic', 'title'
7
8
  image_urls_path 'img.comic'
8
9
  end
@@ -2,7 +2,8 @@ module Extractula
2
2
  class Flickr < Extractula::Extractor
3
3
  include Extractula::OEmbed
4
4
  domain 'flickr'
5
- content_path 'div.photoDescription'
5
+ media_type 'image'
6
+ content_path 'meta[name=description]', 'content'
6
7
  oembed_endpoint 'http://www.flickr.com/services/oembed/'
7
8
  end
8
9
  end
@@ -2,6 +2,7 @@ module Extractula
2
2
  class YouTube < Extractula::Extractor
3
3
  include Extractula::OEmbed
4
4
  domain 'youtube'
5
+ media_type 'video'
5
6
  content_path '.description'
6
7
  oembed_endpoint 'http://www.youtube.com/oembed'
7
8
  end
@@ -1,5 +1,5 @@
1
1
  class Extractula::ExtractedContent
2
- attr_reader :url, :title, :content, :summary, :image_urls, :video_embed
2
+ attr_reader :url, :media_type, :title, :content, :summary, :image_urls, :video_embed
3
3
 
4
4
  def initialize(attributes = {})
5
5
  attributes.each_pair {|k, v| instance_variable_set("@#{k}", v)}
@@ -14,6 +14,11 @@ class Extractula::Extractor
14
14
  def self.can_extract? url, html
15
15
  @extractable_domain ? @extractable_domain == url.domain : false
16
16
  end
17
+
18
+ def self.media_type type = nil
19
+ @media_type = type if type
20
+ @media_type
21
+ end
17
22
 
18
23
  %w{title content summary image_urls video_embed }.each do |field|
19
24
  class_eval <<-EOS
@@ -50,6 +55,7 @@ class Extractula::Extractor
50
55
  def extract
51
56
  Extractula::ExtractedContent.new({
52
57
  :url => url.url,
58
+ :media_type => media_type,
53
59
  :title => title,
54
60
  :content => content,
55
61
  :summary => summary,
@@ -58,6 +64,10 @@ class Extractula::Extractor
58
64
  })
59
65
  end
60
66
 
67
+ def media_type
68
+ self.class.media_type || 'text'
69
+ end
70
+
61
71
  def title
62
72
  content_at(title_path, title_attr) || content_at("//title")
63
73
  end
@@ -72,7 +82,11 @@ class Extractula::Extractor
72
82
 
73
83
  def image_urls
74
84
  if image_urls_path
75
- html.search(image_urls_path).collect { |img| img['src'].strip }
85
+ html.search(image_urls_path).collect do |img|
86
+ src = img['src'].strip
87
+ src = "#{@url.scheme}://#{@url.host}#{src}" if src.start_with?('/')
88
+ src
89
+ end
76
90
  end
77
91
  end
78
92
 
data/lib/extractula.rb CHANGED
@@ -8,7 +8,7 @@ require 'extractula/extracted_content'
8
8
  require 'extractula/extractor'
9
9
 
10
10
  module Extractula
11
- VERSION = "0.0.2"
11
+ VERSION = "0.0.3"
12
12
 
13
13
  @extractors = []
14
14
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extractula
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Dix