extractula 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/extractula.rb +36 -21
- data/spec/extractula_spec.rb +45 -2
- metadata +1 -1
data/lib/extractula.rb
CHANGED
@@ -9,29 +9,44 @@ require 'extractula/extracted_content'
|
|
9
9
|
require 'extractula/extractor'
|
10
10
|
|
11
11
|
module Extractula
|
12
|
-
VERSION = "0.0.8"
|
12
|
+
VERSION = "0.0.9"
|
13
13
|
|
14
14
|
@extractors = []
|
15
15
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
16
|
+
class << self
|
17
|
+
|
18
|
+
attr_reader :extractors, :last_extractor
|
19
|
+
|
20
|
+
def add_extractor(extractor_class)
|
21
|
+
@extractors << extractor_class
|
22
|
+
end
|
23
|
+
|
24
|
+
def remove_extractor(extractor_class)
|
25
|
+
@extractors.delete extractor_class
|
26
|
+
end
|
27
|
+
|
28
|
+
def extract(url, html)
|
29
|
+
parsed_url, parsed_html = Domainatrix.parse(url), Nokogiri::HTML(html)
|
30
|
+
extractor = select_extractor parsed_url, parsed_html
|
31
|
+
extractor.new(parsed_url, parsed_html).extract
|
32
|
+
end
|
33
|
+
|
34
|
+
def select_extractor url, html
|
35
|
+
@last_extractor = @extractors.detect {|e| e.can_extract? url, html} || Extractor
|
36
|
+
end
|
37
|
+
|
38
|
+
def custom_extractor(*args, &block)
|
39
|
+
config = args.last.is_a?(Hash) ? args.pop : {}
|
40
|
+
klass_name = args[0]
|
41
|
+
if block_given?
|
42
|
+
klass = Class.new Extractula::Extractor, &block
|
43
|
+
else
|
44
|
+
klass = Class.new Extractula::Extractor
|
45
|
+
klass.__send__ :include, Extractula::OEmbed if config.delete(:oembed)
|
46
|
+
config.each { |option, args| klass.__send__(option, *args) }
|
47
|
+
end
|
48
|
+
const_set klass_name, klass if klass_name
|
49
|
+
klass
|
50
|
+
end
|
36
51
|
end
|
37
52
|
end
|
data/spec/extractula_spec.rb
CHANGED
@@ -12,7 +12,7 @@ describe "extractula" do
|
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
15
|
-
Extractula.add_extractor custom_extractor
|
15
|
+
# Extractula.add_extractor custom_extractor
|
16
16
|
content = Extractula.extract("http://pauldix.net", "some html")
|
17
17
|
content.url.should == "custom extractor url"
|
18
18
|
content.summary.should == "my custom extractor"
|
@@ -30,7 +30,7 @@ describe "extractula" do
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
-
Extractula.add_extractor custom_extractor
|
33
|
+
# Extractula.add_extractor custom_extractor
|
34
34
|
content = Extractula.extract("http://pauldix.net", "some html")
|
35
35
|
content.url.should_not == "this url"
|
36
36
|
content.summary.should_not == "this summary"
|
@@ -42,4 +42,47 @@ describe "extractula" do
|
|
42
42
|
result.should be_a Extractula::ExtractedContent
|
43
43
|
result.url.should == "http://pauldix.net"
|
44
44
|
end
|
45
|
+
|
46
|
+
it "saves a reference to the last extractor used" do
|
47
|
+
custom_extractor = Class.new(Extractula::Extractor) do
|
48
|
+
def self.can_extract? url, html
|
49
|
+
true
|
50
|
+
end
|
51
|
+
end
|
52
|
+
Extractula.extract "http://pauldix.net", "some html"
|
53
|
+
Extractula.last_extractor.should == custom_extractor
|
54
|
+
Extractula.remove_extractor custom_extractor
|
55
|
+
end
|
56
|
+
|
57
|
+
describe "defining an inline custom extractor" do
|
58
|
+
it "takes a block form definition" do
|
59
|
+
klass = Extractula.custom_extractor do
|
60
|
+
domain 'pauldix'
|
61
|
+
content_path '#content'
|
62
|
+
end
|
63
|
+
Extractula.extractors.should include(klass)
|
64
|
+
Extractula.remove_extractor klass
|
65
|
+
end
|
66
|
+
|
67
|
+
it "takes a hash form definition" do
|
68
|
+
klass = Extractula.custom_extractor :domain => 'pauldix', :content_path => '#content'
|
69
|
+
Extractula.extractors.should include(klass)
|
70
|
+
Extractula.remove_extractor klass
|
71
|
+
end
|
72
|
+
|
73
|
+
it "can be named" do
|
74
|
+
klass = Extractula.custom_extractor :PaulDix do
|
75
|
+
domain 'pauldix'
|
76
|
+
content_path '#content'
|
77
|
+
end
|
78
|
+
Extractula.const_defined?(:PaulDix).should be_true
|
79
|
+
Extractula.remove_extractor klass
|
80
|
+
end
|
81
|
+
|
82
|
+
it "can contain the OEmbed module" do
|
83
|
+
klass = Extractula.custom_extractor :oembed => true
|
84
|
+
klass.should include(Extractula::OEmbed)
|
85
|
+
Extractula.remove_extractor klass
|
86
|
+
end
|
87
|
+
end
|
45
88
|
end
|