nibbler 1.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. data/Rakefile +3 -3
  2. data/lib/nibbler.rb +43 -31
  3. metadata +4 -3
data/Rakefile CHANGED
@@ -1,4 +1,4 @@
1
- task :default => :spec
1
+ task :default => [:loc, :spec]
2
2
 
3
3
  desc %(Run specs)
4
4
  task :spec do
@@ -13,12 +13,12 @@ task :loc do
13
13
  file.each_line do |line|
14
14
  case line
15
15
  when /^class\b/ then counting = true
16
- when /^\s*(#|\Z)/ then next
16
+ when /^\s*(#|$)/ then next
17
17
  when /^end\b/ then break
18
18
  end
19
19
  loc += 1 if counting
20
20
  end
21
21
 
22
- puts loc
22
+ puts "#{loc} lines of code"
23
23
  end
24
24
  end
data/lib/nibbler.rb CHANGED
@@ -1,20 +1,8 @@
1
- ## A minimalistic, declarative HTML scraper
2
-
1
+ # A minimalistic, declarative HTML scraper
3
2
  class Nibbler
4
3
  attr_reader :doc
5
4
 
6
- # Accepts string, open file, or Nokogiri-like document
7
- def initialize(doc)
8
- @doc = self.class.convert_document(doc)
9
- initialize_plural_accessors
10
- end
11
-
12
- # Initialize a new scraper and process data
13
- def self.parse(html)
14
- new(html).parse
15
- end
16
-
17
- # Specify a new singular scraping rule
5
+ # Declare a singular scraping rule
18
6
  def self.element(*args, &block)
19
7
  selector, name, delegate = parse_rule_declaration(*args, &block)
20
8
  rules[name] = [selector, delegate]
@@ -22,19 +10,27 @@ class Nibbler
22
10
  name
23
11
  end
24
12
 
25
- # Specify a new plural scraping rule
13
+ # Declare a plural scraping rule
26
14
  def self.elements(*args, &block)
27
15
  name = element(*args, &block)
28
16
  rules[name] << true
29
17
  end
30
18
 
31
- # Let it do its thing!
19
+ # Process data by creating a new scraper
20
+ def self.parse(data) new(data).parse end
21
+
22
+ # Initialize the parser with raw data or a document
23
+ def initialize(data)
24
+ @doc = self.class.convert_document(data)
25
+ # initialize plural properties
26
+ self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
27
+ end
28
+
29
+ # Parse the document and save values returned by selectors
32
30
  def parse
33
31
  self.class.rules.each do |target, (selector, delegate, plural)|
34
32
  if plural
35
- @doc.search(selector).each do |node|
36
- send(target) << parse_result(node, delegate)
37
- end
33
+ send(target).concat @doc.search(selector).map { |i| parse_result(i, delegate) }
38
34
  else
39
35
  send("#{target}=", parse_result(@doc.at(selector), delegate))
40
36
  end
@@ -42,6 +38,16 @@ class Nibbler
42
38
  self
43
39
  end
44
40
 
41
+ # Dump the extracted data into a hash with symbolized keys
42
+ def to_hash
43
+ converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
44
+ self.class.rules.keys.inject({}) do |hash, name|
45
+ value = send(name)
46
+ hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
47
+ hash
48
+ end
49
+ end
50
+
45
51
  protected
46
52
 
47
53
  # `delegate` is optional, but should respond to `call` or `parse`
@@ -57,18 +63,20 @@ class Nibbler
57
63
 
58
64
  private
59
65
 
66
+ # Parsing rules declared with `element` or `elements`
60
67
  def self.rules
61
68
  @rules ||= {}
62
69
  end
63
70
 
71
+ # Make subclasses inherit the parsing rules
64
72
  def self.inherited(subclass)
65
73
  subclass.rules.update self.rules
66
74
  end
67
75
 
68
- # Rule declaration is in Hash or single argument form:
76
+ # Rule declaration forms:
69
77
  #
70
- # { '//some/selector' => :name, :with => delegate }
71
- # #=> ['//some/selector', :name, delegate]
78
+ # { 'selector' => :property, :with => delegate }
79
+ # #=> ['selector', :property, delegate]
72
80
  #
73
81
  # :title
74
82
  # #=> ['title', :title, nil]
@@ -82,18 +90,12 @@ class Nibbler
82
90
  return selector, property, delegate
83
91
  end
84
92
 
85
- def initialize_plural_accessors
86
- self.class.rules.each do |name, (s, k, plural)|
87
- send("#{name}=", []) if plural
88
- end
89
- end
90
-
93
+ # Parse data with Nokogiri unless it's already an acceptable document
91
94
  def self.convert_document(doc)
92
- unless doc.respond_to?(:at) && doc.respond_to?(:search)
95
+ if doc.respond_to?(:at) and doc.respond_to?(:search) then doc
96
+ else
93
97
  require 'nokogiri' unless defined? ::Nokogiri
94
98
  Nokogiri doc
95
- else
96
- doc
97
99
  end
98
100
  end
99
101
  end
@@ -180,6 +182,16 @@ if __FILE__ == $0
180
182
  @blog.navigation_items.should == %w[Home About Help]
181
183
  end
182
184
 
185
+ it "should convert to hash" do
186
+ hash = @blog.to_hash
187
+ hash[:navigation_items].should == %w[Home About Help]
188
+ hash[:title].should == "Maximum awesome"
189
+ article = hash[:articles].first
190
+ article[:title] == "First article"
191
+ article.key?(:link).should be_true
192
+ article[:link].should be_nil
193
+ end
194
+
183
195
  it "should have title, pubdate for first article" do
184
196
  article = @blog.articles[0]
185
197
  article.title.should == 'First article'
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nibbler
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
+ - 1
8
9
  - 0
9
- version: "1.0"
10
+ version: 1.1.0
10
11
  platform: ruby
11
12
  authors:
12
13
  - "Mislav Marohni\xC4\x87"
@@ -14,7 +15,7 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2010-08-16 00:00:00 +02:00
18
+ date: 2010-08-30 00:00:00 +02:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency