extractula 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -12,9 +12,13 @@ class Extractula::Extractor
12
12
  end
13
13
 
14
14
  def self.can_extract? url, html
15
- @extractable_domain ? @extractable_domain == url.domain : false
15
+ if @extractable_domain.is_a? Regexp
16
+ url.host + url.path =~ @extractable_domain
17
+ else
18
+ @extractable_domain ? @extractable_domain == url.domain : false
19
+ end
16
20
  end
17
-
21
+
18
22
  def self.media_type type = nil
19
23
  @media_type = type if type
20
24
  @media_type
@@ -22,10 +26,11 @@ class Extractula::Extractor
22
26
 
23
27
  %w{title content summary image_urls video_embed }.each do |field|
24
28
  class_eval <<-EOS
25
- def self.#{field}_path(path = nil, attrib = nil)
29
+ def self.#{field}_path(path = nil, attrib = nil, &block)
26
30
  if path
27
31
  @#{field}_path = path
28
32
  @#{field}_attr = attrib || :text
33
+ @#{field}_block = block
29
34
  end
30
35
  @#{field}_path
31
36
  end
@@ -34,6 +39,11 @@ class Extractula::Extractor
34
39
  @#{field}_attr = attrib if attrib
35
40
  @#{field}_attr
36
41
  end
42
+
43
+ def self.#{field}_block(&block)
44
+ @#{field}_block = block if block
45
+ @#{field}_block
46
+ end
37
47
 
38
48
  def #{field}_path
39
49
  self.class.#{field}_path
@@ -42,6 +52,10 @@ class Extractula::Extractor
42
52
  def #{field}_attr
43
53
  self.class.#{field}_attr
44
54
  end
55
+
56
+ def #{field}_block
57
+ self.class.#{field}_block
58
+ end
45
59
  EOS
46
60
  end
47
61
 
@@ -69,15 +83,15 @@ class Extractula::Extractor
69
83
  end
70
84
 
71
85
  def title
72
- content_at(title_path, title_attr) || content_at("//title")
86
+ content_at(title_path, title_attr, title_block) || content_at("//title")
73
87
  end
74
88
 
75
89
  def content
76
- content_at(content_path, content_attr) || extract_content
90
+ content_at(content_path, content_attr, content_block) || extract_content
77
91
  end
78
92
 
79
93
  def summary
80
- content_at(summary_path, summary_attr)
94
+ content_at(summary_path, summary_attr, summary_block)
81
95
  end
82
96
 
83
97
  def image_urls
@@ -98,10 +112,11 @@ class Extractula::Extractor
98
112
 
99
113
  private
100
114
 
101
- def content_at(path, attrib = :text)
115
+ def content_at(path, attrib = :text, block = nil)
102
116
  if path
103
117
  if node = html.at(path)
104
- attrib == :text ? node.text.strip : node[attrib].strip
118
+ value = attrib == :text ? node.text.strip : node[attrib].strip
119
+ block ? block.call(value) : value
105
120
  end
106
121
  end
107
122
  end
data/lib/extractula.rb CHANGED
@@ -9,7 +9,7 @@ require 'extractula/extracted_content'
9
9
  require 'extractula/extractor'
10
10
 
11
11
  module Extractula
12
- VERSION = "0.0.5"
12
+ VERSION = "0.0.6"
13
13
 
14
14
  @extractors = []
15
15
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extractula
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Dix