extractula 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/lib/extractula.rb +36 -21
  2. data/spec/extractula_spec.rb +45 -2
  3. metadata +1 -1
@@ -9,29 +9,44 @@ require 'extractula/extracted_content'
9
9
  require 'extractula/extractor'
10
10
 
11
11
  module Extractula
12
- VERSION = "0.0.8"
12
+ VERSION = "0.0.9"
13
13
 
14
14
  @extractors = []
15
15
 
16
- def self.add_extractor(extractor_class)
17
- @extractors << extractor_class
18
- end
19
-
20
- def self.remove_extractor(extractor_class)
21
- @extractors.delete extractor_class
22
- end
23
-
24
- def self.extract(url, html)
25
- parsed_url = Domainatrix.parse(url)
26
- parsed_html = Nokogiri::HTML(html)
27
- extractor = @extractors.detect {|e| e.can_extract? parsed_url, parsed_html} || Extractor
28
- extractor.new(parsed_url, parsed_html).extract
29
- end
30
-
31
- def self.custom_extractor(config = {})
32
- klass = Class.new(Extractula::Extractor)
33
- klass.include(Extractula::OEmbed) if config.delete(:oembed)
34
- config.each { |option, args| klass.__send__(option, *args) }
35
- klass
16
+ class << self
17
+
18
+ attr_reader :extractors, :last_extractor
19
+
20
+ def add_extractor(extractor_class)
21
+ @extractors << extractor_class
22
+ end
23
+
24
+ def remove_extractor(extractor_class)
25
+ @extractors.delete extractor_class
26
+ end
27
+
28
+ def extract(url, html)
29
+ parsed_url, parsed_html = Domainatrix.parse(url), Nokogiri::HTML(html)
30
+ extractor = select_extractor parsed_url, parsed_html
31
+ extractor.new(parsed_url, parsed_html).extract
32
+ end
33
+
34
+ def select_extractor url, html
35
+ @last_extractor = @extractors.detect {|e| e.can_extract? url, html} || Extractor
36
+ end
37
+
38
+ def custom_extractor(*args, &block)
39
+ config = args.last.is_a?(Hash) ? args.pop : {}
40
+ klass_name = args[0]
41
+ if block_given?
42
+ klass = Class.new Extractula::Extractor, &block
43
+ else
44
+ klass = Class.new Extractula::Extractor
45
+ klass.__send__ :include, Extractula::OEmbed if config.delete(:oembed)
46
+ config.each { |option, args| klass.__send__(option, *args) }
47
+ end
48
+ const_set klass_name, klass if klass_name
49
+ klass
50
+ end
36
51
  end
37
52
  end
@@ -12,7 +12,7 @@ describe "extractula" do
12
12
  end
13
13
  end
14
14
 
15
- Extractula.add_extractor custom_extractor
15
+ # Extractula.add_extractor custom_extractor
16
16
  content = Extractula.extract("http://pauldix.net", "some html")
17
17
  content.url.should == "custom extractor url"
18
18
  content.summary.should == "my custom extractor"
@@ -30,7 +30,7 @@ describe "extractula" do
30
30
  end
31
31
  end
32
32
 
33
- Extractula.add_extractor custom_extractor
33
+ # Extractula.add_extractor custom_extractor
34
34
  content = Extractula.extract("http://pauldix.net", "some html")
35
35
  content.url.should_not == "this url"
36
36
  content.summary.should_not == "this summary"
@@ -42,4 +42,47 @@ describe "extractula" do
42
42
  result.should be_a Extractula::ExtractedContent
43
43
  result.url.should == "http://pauldix.net"
44
44
  end
45
+
46
+ it "saves a reference to the last extractor used" do
47
+ custom_extractor = Class.new(Extractula::Extractor) do
48
+ def self.can_extract? url, html
49
+ true
50
+ end
51
+ end
52
+ Extractula.extract "http://pauldix.net", "some html"
53
+ Extractula.last_extractor.should == custom_extractor
54
+ Extractula.remove_extractor custom_extractor
55
+ end
56
+
57
+ describe "defining an inline custom extractor" do
58
+ it "takes a block form definition" do
59
+ klass = Extractula.custom_extractor do
60
+ domain 'pauldix'
61
+ content_path '#content'
62
+ end
63
+ Extractula.extractors.should include(klass)
64
+ Extractula.remove_extractor klass
65
+ end
66
+
67
+ it "takes a hash form definition" do
68
+ klass = Extractula.custom_extractor :domain => 'pauldix', :content_path => '#content'
69
+ Extractula.extractors.should include(klass)
70
+ Extractula.remove_extractor klass
71
+ end
72
+
73
+ it "can be named" do
74
+ klass = Extractula.custom_extractor :PaulDix do
75
+ domain 'pauldix'
76
+ content_path '#content'
77
+ end
78
+ Extractula.const_defined?(:PaulDix).should be_true
79
+ Extractula.remove_extractor klass
80
+ end
81
+
82
+ it "can contain the OEmbed module" do
83
+ klass = Extractula.custom_extractor :oembed => true
84
+ klass.should include(Extractula::OEmbed)
85
+ Extractula.remove_extractor klass
86
+ end
87
+ end
45
88
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extractula
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Dix