nibbler 1.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/Rakefile +3 -3
  2. data/lib/nibbler.rb +43 -31
  3. metadata +4 -3
data/Rakefile CHANGED
@@ -1,4 +1,4 @@
1
- task :default => :spec
1
+ task :default => [:loc, :spec]
2
2
 
3
3
  desc %(Run specs)
4
4
  task :spec do
@@ -13,12 +13,12 @@ task :loc do
13
13
  file.each_line do |line|
14
14
  case line
15
15
  when /^class\b/ then counting = true
16
- when /^\s*(#|\Z)/ then next
16
+ when /^\s*(#|$)/ then next
17
17
  when /^end\b/ then break
18
18
  end
19
19
  loc += 1 if counting
20
20
  end
21
21
 
22
- puts loc
22
+ puts "#{loc} lines of code"
23
23
  end
24
24
  end
data/lib/nibbler.rb CHANGED
@@ -1,20 +1,8 @@
1
- ## A minimalistic, declarative HTML scraper
2
-
1
+ # A minimalistic, declarative HTML scraper
3
2
  class Nibbler
4
3
  attr_reader :doc
5
4
 
6
- # Accepts string, open file, or Nokogiri-like document
7
- def initialize(doc)
8
- @doc = self.class.convert_document(doc)
9
- initialize_plural_accessors
10
- end
11
-
12
- # Initialize a new scraper and process data
13
- def self.parse(html)
14
- new(html).parse
15
- end
16
-
17
- # Specify a new singular scraping rule
5
+ # Declare a singular scraping rule
18
6
  def self.element(*args, &block)
19
7
  selector, name, delegate = parse_rule_declaration(*args, &block)
20
8
  rules[name] = [selector, delegate]
@@ -22,19 +10,27 @@ class Nibbler
22
10
  name
23
11
  end
24
12
 
25
- # Specify a new plural scraping rule
13
+ # Declare a plural scraping rule
26
14
  def self.elements(*args, &block)
27
15
  name = element(*args, &block)
28
16
  rules[name] << true
29
17
  end
30
18
 
31
- # Let it do its thing!
19
+ # Process data by creating a new scraper
20
+ def self.parse(data) new(data).parse end
21
+
22
+ # Initialize the parser with raw data or a document
23
+ def initialize(data)
24
+ @doc = self.class.convert_document(data)
25
+ # initialize plural properties
26
+ self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
27
+ end
28
+
29
+ # Parse the document and save values returned by selectors
32
30
  def parse
33
31
  self.class.rules.each do |target, (selector, delegate, plural)|
34
32
  if plural
35
- @doc.search(selector).each do |node|
36
- send(target) << parse_result(node, delegate)
37
- end
33
+ send(target).concat @doc.search(selector).map { |i| parse_result(i, delegate) }
38
34
  else
39
35
  send("#{target}=", parse_result(@doc.at(selector), delegate))
40
36
  end
@@ -42,6 +38,16 @@ class Nibbler
42
38
  self
43
39
  end
44
40
 
41
+ # Dump the extracted data into a hash with symbolized keys
42
+ def to_hash
43
+ converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
44
+ self.class.rules.keys.inject({}) do |hash, name|
45
+ value = send(name)
46
+ hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
47
+ hash
48
+ end
49
+ end
50
+
45
51
  protected
46
52
 
47
53
  # `delegate` is optional, but should respond to `call` or `parse`
@@ -57,18 +63,20 @@ class Nibbler
57
63
 
58
64
  private
59
65
 
66
+ # Parsing rules declared with `element` or `elements`
60
67
  def self.rules
61
68
  @rules ||= {}
62
69
  end
63
70
 
71
+ # Make subclasses inherit the parsing rules
64
72
  def self.inherited(subclass)
65
73
  subclass.rules.update self.rules
66
74
  end
67
75
 
68
- # Rule declaration is in Hash or single argument form:
76
+ # Rule declaration forms:
69
77
  #
70
- # { '//some/selector' => :name, :with => delegate }
71
- # #=> ['//some/selector', :name, delegate]
78
+ # { 'selector' => :property, :with => delegate }
79
+ # #=> ['selector', :property, delegate]
72
80
  #
73
81
  # :title
74
82
  # #=> ['title', :title, nil]
@@ -82,18 +90,12 @@ class Nibbler
82
90
  return selector, property, delegate
83
91
  end
84
92
 
85
- def initialize_plural_accessors
86
- self.class.rules.each do |name, (s, k, plural)|
87
- send("#{name}=", []) if plural
88
- end
89
- end
90
-
93
+ # Parse data with Nokogiri unless it's already an acceptable document
91
94
  def self.convert_document(doc)
92
- unless doc.respond_to?(:at) && doc.respond_to?(:search)
95
+ if doc.respond_to?(:at) and doc.respond_to?(:search) then doc
96
+ else
93
97
  require 'nokogiri' unless defined? ::Nokogiri
94
98
  Nokogiri doc
95
- else
96
- doc
97
99
  end
98
100
  end
99
101
  end
@@ -180,6 +182,16 @@ if __FILE__ == $0
180
182
  @blog.navigation_items.should == %w[Home About Help]
181
183
  end
182
184
 
185
+ it "should convert to hash" do
186
+ hash = @blog.to_hash
187
+ hash[:navigation_items].should == %w[Home About Help]
188
+ hash[:title].should == "Maximum awesome"
189
+ article = hash[:articles].first
190
+ article[:title] == "First article"
191
+ article.key?(:link).should be_true
192
+ article[:link].should be_nil
193
+ end
194
+
183
195
  it "should have title, pubdate for first article" do
184
196
  article = @blog.articles[0]
185
197
  article.title.should == 'First article'
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nibbler
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
+ - 1
8
9
  - 0
9
- version: "1.0"
10
+ version: 1.1.0
10
11
  platform: ruby
11
12
  authors:
12
13
  - "Mislav Marohni\xC4\x87"
@@ -14,7 +15,7 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2010-08-16 00:00:00 +02:00
18
+ date: 2010-08-30 00:00:00 +02:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency