extractula 0.0.5 → 0.0.6

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -12,9 +12,13 @@ class Extractula::Extractor
12
12
  end
13
13
 
14
14
  def self.can_extract? url, html
15
- @extractable_domain ? @extractable_domain == url.domain : false
15
+ if @extractable_domain.is_a? Regexp
16
+ url.host + url.path =~ @extractable_domain
17
+ else
18
+ @extractable_domain ? @extractable_domain == url.domain : false
19
+ end
16
20
  end
17
-
21
+
18
22
  def self.media_type type = nil
19
23
  @media_type = type if type
20
24
  @media_type
@@ -22,10 +26,11 @@ class Extractula::Extractor
22
26
 
23
27
  %w{title content summary image_urls video_embed }.each do |field|
24
28
  class_eval <<-EOS
25
- def self.#{field}_path(path = nil, attrib = nil)
29
+ def self.#{field}_path(path = nil, attrib = nil, &block)
26
30
  if path
27
31
  @#{field}_path = path
28
32
  @#{field}_attr = attrib || :text
33
+ @#{field}_block = block
29
34
  end
30
35
  @#{field}_path
31
36
  end
@@ -34,6 +39,11 @@ class Extractula::Extractor
34
39
  @#{field}_attr = attrib if attrib
35
40
  @#{field}_attr
36
41
  end
42
+
43
+ def self.#{field}_block(&block)
44
+ @#{field}_block = block if block
45
+ @#{field}_block
46
+ end
37
47
 
38
48
  def #{field}_path
39
49
  self.class.#{field}_path
@@ -42,6 +52,10 @@ class Extractula::Extractor
42
52
  def #{field}_attr
43
53
  self.class.#{field}_attr
44
54
  end
55
+
56
+ def #{field}_block
57
+ self.class.#{field}_block
58
+ end
45
59
  EOS
46
60
  end
47
61
 
@@ -69,15 +83,15 @@ class Extractula::Extractor
69
83
  end
70
84
 
71
85
  def title
72
- content_at(title_path, title_attr) || content_at("//title")
86
+ content_at(title_path, title_attr, title_block) || content_at("//title")
73
87
  end
74
88
 
75
89
  def content
76
- content_at(content_path, content_attr) || extract_content
90
+ content_at(content_path, content_attr, content_block) || extract_content
77
91
  end
78
92
 
79
93
  def summary
80
- content_at(summary_path, summary_attr)
94
+ content_at(summary_path, summary_attr, summary_block)
81
95
  end
82
96
 
83
97
  def image_urls
@@ -98,10 +112,11 @@ class Extractula::Extractor
98
112
 
99
113
  private
100
114
 
101
- def content_at(path, attrib = :text)
115
+ def content_at(path, attrib = :text, block = nil)
102
116
  if path
103
117
  if node = html.at(path)
104
- attrib == :text ? node.text.strip : node[attrib].strip
118
+ value = attrib == :text ? node.text.strip : node[attrib].strip
119
+ block ? block.call(value) : value
105
120
  end
106
121
  end
107
122
  end
data/lib/extractula.rb CHANGED
@@ -9,7 +9,7 @@ require 'extractula/extracted_content'
9
9
  require 'extractula/extractor'
10
10
 
11
11
  module Extractula
12
- VERSION = "0.0.5"
12
+ VERSION = "0.0.6"
13
13
 
14
14
  @extractors = []
15
15
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extractula
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Dix