graboid 0.3.5 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -12,7 +12,6 @@ begin
12
12
  gem.authors = ["Christopher Burnett"]
13
13
  gem.add_development_dependency "rspec", ">= 1.2.9"
14
14
  gem.add_dependency "nokogiri"
15
- gem.add_dependency "activesupport"
16
15
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
16
  end
18
17
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.5
1
+ 0.4.0
@@ -1,61 +1,57 @@
1
1
  # Generated by jeweler
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{graboid}
8
- s.version = "0.3.5"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Christopher Burnett"]
12
- s.date = %q{2010-07-09}
12
+ s.date = %q{2011-02-22}
13
13
  s.description = %q{web scraping made easier}
14
14
  s.email = %q{signalstatic@gmail.com}
15
15
  s.extra_rdoc_files = [
16
16
  "LICENSE",
17
- "README.mdown"
17
+ "README.mdown"
18
18
  ]
19
19
  s.files = [
20
20
  ".document",
21
- ".gitignore",
22
- "LICENSE",
23
- "README.mdown",
24
- "Rakefile",
25
- "VERSION",
26
- "examples/active_rain_post.rb",
27
- "examples/live_journal_post.rb",
28
- "examples/ning_post.rb",
29
- "graboid.gemspec",
30
- "lib/graboid.rb",
31
- "lib/graboid/entity.rb",
32
- "lib/graboid/scraper.rb",
33
- "spec/fixtures/graboid.jpg",
34
- "spec/fixtures/posts.html",
35
- "spec/fixtures/server.rb",
36
- "spec/fixtures/views/posts.erb",
37
- "spec/graboid/entity_spec.rb",
38
- "spec/graboid/scraper_spec.rb",
39
- "spec/graboid_spec.rb",
40
- "spec/spec.opts",
41
- "spec/spec_helper.rb"
21
+ "LICENSE",
22
+ "README.mdown",
23
+ "Rakefile",
24
+ "VERSION",
25
+ "examples/active_rain_post.rb",
26
+ "examples/live_journal_post.rb",
27
+ "examples/ning_post.rb",
28
+ "graboid.gemspec",
29
+ "lib/graboid.rb",
30
+ "lib/graboid/entity.rb",
31
+ "lib/graboid/scraper.rb",
32
+ "spec/fixtures/graboid.jpg",
33
+ "spec/fixtures/posts.html",
34
+ "spec/fixtures/server.rb",
35
+ "spec/fixtures/views/posts.erb",
36
+ "spec/graboid/entity_spec.rb",
37
+ "spec/graboid/scraper_spec.rb",
38
+ "spec/graboid_spec.rb",
39
+ "spec/spec.opts",
40
+ "spec/spec_helper.rb"
42
41
  ]
43
42
  s.homepage = %q{http://github.com/twoism/graboid}
44
- s.rdoc_options = ["--charset=UTF-8"]
45
43
  s.require_paths = ["lib"]
46
44
  s.rubygems_version = %q{1.3.7}
47
45
  s.summary = %q{web scraping made easy}
48
46
  s.test_files = [
47
+ "examples/active_rain_post.rb",
48
+ "examples/live_journal_post.rb",
49
+ "examples/ning_post.rb",
49
50
  "spec/fixtures/server.rb",
50
- "spec/graboid/entity_spec.rb",
51
- "spec/graboid/scraper_spec.rb",
52
- "spec/graboid_spec.rb",
53
- "spec/spec_helper.rb",
54
- "examples/active_rain_post.rb",
55
- "examples/live_journal_post.rb",
56
- "examples/ning_post.rb",
57
- "examples/reddit_post.rb",
58
- "examples/tumblr_post.rb"
51
+ "spec/graboid/entity_spec.rb",
52
+ "spec/graboid/scraper_spec.rb",
53
+ "spec/graboid_spec.rb",
54
+ "spec/spec_helper.rb"
59
55
  ]
60
56
 
61
57
  if s.respond_to? :specification_version then
@@ -65,16 +61,13 @@ Gem::Specification.new do |s|
65
61
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
66
62
  s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
67
63
  s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
68
- s.add_runtime_dependency(%q<activesupport>, [">= 0"])
69
64
  else
70
65
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
71
66
  s.add_dependency(%q<nokogiri>, [">= 0"])
72
- s.add_dependency(%q<activesupport>, [">= 0"])
73
67
  end
74
68
  else
75
69
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
76
70
  s.add_dependency(%q<nokogiri>, [">= 0"])
77
- s.add_dependency(%q<activesupport>, [">= 0"])
78
71
  end
79
72
  end
80
73
 
@@ -1,10 +1,20 @@
1
- %w{rubygems nokogiri open-uri active_support ostruct}.each { |f| require f }
1
+ %w{rubygems nokogiri open-uri ostruct}.each { |f| require f }
2
2
 
3
3
  dir = Pathname(__FILE__).dirname.expand_path
4
4
 
5
5
  require dir + 'graboid/entity'
6
6
  require dir + 'graboid/scraper'
7
7
 
8
+ class String
9
+ def underscore
10
+ self.to_s.gsub(/::/, '/').
11
+ gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
12
+ gsub(/([a-z\d])([A-Z])/,'\1_\2').
13
+ tr("-", "_").
14
+ downcase
15
+ end
16
+ end
17
+
8
18
  module Graboid
9
19
  extend self
10
20
 
@@ -15,4 +25,4 @@ module Graboid
15
25
  def user_agent=(agent)
16
26
  @user_agent = agent
17
27
  end
18
- end
28
+ end
@@ -1,68 +1,87 @@
1
1
  module Graboid
2
2
  module Entity
3
-
3
+
4
4
  def self.included klass
5
5
  klass.class_eval do
6
6
  extend ClassMethods
7
7
  include InstanceMethods
8
- warn "Deprecation Warning! Graboid::Entity - This module has been deprecated. See Graboid::Scraper."
9
- write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
8
+
9
+ inherited_attributes :attribute_map
10
+ @attribute_map = {}
10
11
  end
11
12
  end
12
-
13
+
13
14
  module ClassMethods
14
-
15
+
16
+ def inherited_attributes(*args)
17
+ @inherited_attributes ||= [:inherited_attributes]
18
+ @inherited_attributes += args
19
+ args.each do |arg|
20
+ class_eval %(
21
+ class << self; attr_accessor :#{arg} end
22
+ )
23
+ end
24
+ @inherited_attributes
25
+ end
26
+
27
+ def inherited(subclass)
28
+ @inherited_attributes.each do |inheritable_attribute|
29
+ instance_var = "@#{inheritable_attribute}"
30
+ subclass.instance_variable_set(instance_var, instance_variable_get(instance_var))
31
+ end
32
+ end
33
+
15
34
  def source
16
35
  @source
17
36
  end
18
-
37
+
19
38
  def source=(src)
20
39
  @source = src
21
40
  end
22
-
41
+
23
42
  def set name, opts={}, &block
24
- opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
43
+ opts.merge!(:selector => ".#{name}") if opts[:selector].nil?
25
44
  opts.merge!(:processor => block) if block_given?
26
-
45
+
27
46
  attribute_map[name] = opts
28
47
  end
29
-
48
+
30
49
  alias_method :field, :set
31
-
50
+
32
51
  def selector selector
33
52
  @root_selector = selector
34
53
  end
35
-
54
+
36
55
  alias_method :root, :selector
37
56
 
38
57
  def root_selector
39
58
  @root_selector || inferred_selector
40
59
  end
41
-
60
+
42
61
  def inferred_selector
43
62
  @inferred_selector ||= ".#{self.to_s.underscore}"
44
63
  end
45
-
64
+
46
65
  def doc
47
66
  eval "Nokogiri::#{self.mode.to_s.upcase}(read_source)"
48
67
  end
49
-
68
+
50
69
  def collection
51
70
  @collection ||= []
52
71
  end
53
-
72
+
54
73
  def collection=(col)
55
74
  @collection = col
56
75
  end
57
-
76
+
58
77
  def attribute_map
59
78
  read_inheritable_attribute :attribute_map
60
79
  end
61
-
80
+
62
81
  def extract_instance fragment
63
82
  new(hash_map(fragment))
64
83
  end
65
-
84
+
66
85
  def hash_map fragment
67
86
  attribute_map.inject({}) do |extracted_hash, at|
68
87
  selector, processor = at.last[:selector], at.last[:processor]
@@ -72,7 +91,7 @@ module Graboid
72
91
  extracted_hash
73
92
  end
74
93
  end
75
-
94
+
76
95
  def all_fragments
77
96
  return page_fragments if @pager.nil?
78
97
  old_source = self.source
@@ -85,13 +104,13 @@ module Graboid
85
104
  self.source = old_source
86
105
  self.collection
87
106
  end
88
-
107
+
89
108
  def paginate
90
109
  next_page_url = @pager.call(doc) rescue nil
91
110
  self.source = next_page_url
92
111
  self.current_page += 1
93
112
  end
94
-
113
+
95
114
  def next_page?
96
115
  if max_pages.zero?
97
116
  return true unless @pager.call(doc).nil?
@@ -99,23 +118,23 @@ module Graboid
99
118
  current_page <= max_pages-1
100
119
  end
101
120
  end
102
-
121
+
103
122
  def page_fragments
104
123
  doc.css(root_selector)
105
124
  end
106
-
125
+
107
126
  def all opts={}
108
127
  reset_context
109
- self.max_pages = opts[:max_pages] if opts[:max_pages].present?
128
+ self.max_pages = opts[:max_pages] unless opts[:max_pages].nil?
110
129
  all_fragments.collect{ |frag| extract_instance(frag) }
111
130
  end
112
-
131
+
113
132
  def reset_context
114
133
  self.collection = []
115
134
  self.current_page = 0
116
135
  self.max_pages = 0
117
136
  end
118
-
137
+
119
138
  def read_source
120
139
  case self.source
121
140
  when /^http[s]?:\/\//
@@ -124,36 +143,36 @@ module Graboid
124
143
  self.source
125
144
  end
126
145
  end
127
-
146
+
128
147
  def pager &block
129
148
  @pager = block
130
149
  end
131
-
150
+
132
151
  def mode
133
152
  @mode ||= :html
134
153
  end
135
-
154
+
136
155
  def mode=(m)
137
156
  raise ArgumentError unless [:html, :xml].include?(m)
138
157
  @mode = m
139
158
  end
140
-
159
+
141
160
  def max_pages
142
161
  @max_pages ||= 0
143
162
  end
144
-
163
+
145
164
  def max_pages=num
146
165
  @max_pages = num
147
166
  end
148
-
167
+
149
168
  def current_page
150
169
  @current_page ||= 0
151
170
  end
152
-
171
+
153
172
  def current_page=num
154
173
  @current_page = num
155
174
  end
156
-
175
+
157
176
  instance_eval do
158
177
  [:before, :after].each do |prefix|
159
178
  [:paginate, :extract].each do |suffix|
@@ -163,29 +182,29 @@ module Graboid
163
182
  end
164
183
  define_method "run_#{method_name}_callbacks" do
165
184
  ivar = instance_variable_get("@#{method_name}")
166
- self.class_eval { ivar.call } unless ivar.nil?
185
+ class_eval { ivar.call } unless ivar.nil?
167
186
  end
168
187
  end
169
- end
170
- end
171
-
188
+ end
189
+ end
190
+
172
191
  end # ClassMethods
173
-
192
+
174
193
  module InstanceMethods
175
-
194
+
176
195
  def initialize opts={}
177
196
  opts.each do |k,v|
178
- self.class_eval do
197
+ self.class.class_eval do
179
198
  define_method k do
180
199
  v
181
200
  end
182
201
  end
183
202
  end
184
203
  end
185
-
204
+
186
205
  def attribute_map
187
206
  self.class.attribute_map
188
207
  end
189
208
  end # InstanceMethods
190
209
  end
191
- end
210
+ end
@@ -1,19 +1,35 @@
1
1
  module Graboid
2
+
2
3
  module Scraper
3
4
  def self.included klass
4
5
  klass.class_eval do
5
6
  extend ClassMethods
6
7
  include InstanceMethods
7
-
8
- write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
9
- write_inheritable_attribute(:callbacks, {}) if callbacks.nil?
8
+
9
+ inherited_attributes :attribute_map, :callbacks
10
+ @attribute_map = {}
11
+ @callbacks = {}
10
12
  end
11
13
  end
12
-
14
+
13
15
  module ClassMethods
14
-
15
- def attribute_map
16
- read_inheritable_attribute :attribute_map
16
+
17
+ def inherited_attributes(*args)
18
+ @inherited_attributes ||= [:inherited_attributes]
19
+ @inherited_attributes += args
20
+ args.each do |arg|
21
+ class_eval %(
22
+ class << self; attr_accessor :#{arg} end
23
+ )
24
+ end
25
+ @inherited_attributes
26
+ end
27
+
28
+ def inherited(subclass)
29
+ @inherited_attributes.each do |inheritable_attribute|
30
+ instance_var = "@#{inheritable_attribute}"
31
+ subclass.instance_variable_set(instance_var, instance_variable_get(instance_var))
32
+ end
17
33
  end
18
34
 
19
35
  def callbacks
@@ -41,7 +57,7 @@ module Graboid
41
57
  alias_method :root, :selector
42
58
 
43
59
  def set name, opts={}, &block
44
- opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
60
+ opts.merge!(:selector => ".#{name}") if opts[:selector].nil?
45
61
  opts.merge!(:processor => block) if block_given?
46
62
 
47
63
  attribute_map[name] = opts
@@ -60,14 +76,14 @@ module Graboid
60
76
 
61
77
  module InstanceMethods
62
78
  def initialize opts={}, &block
63
- raise ArgumentError unless opts[:source].present?
79
+ raise ArgumentError if opts[:source].nil?
64
80
  self.source = opts[:source]
65
81
  end
66
82
 
67
83
  def all opts={}, reload=false
68
84
  return self.collection if reload and !self.collection.empty?
69
85
  reset_context
70
- self.max_pages = opts[:max_pages] if opts[:max_pages].present?
86
+ self.max_pages = opts[:max_pages] unless opts[:max_pages].nil?
71
87
  all_fragments.collect{ |frag| extract_instance(frag) }
72
88
  end
73
89
 
@@ -202,11 +218,11 @@ module Graboid
202
218
  [:paginate, :extract].each do |suffix|
203
219
  method_name = "#{prefix}_#{suffix}"
204
220
  define_method "run_#{method_name}_callbacks" do
205
- self.instance_eval &callbacks[method_name.to_sym] if callbacks[method_name.to_sym].present?
221
+ self.instance_eval &callbacks[method_name.to_sym] unless callbacks[method_name.to_sym].nil?
206
222
  end
207
223
  end
208
224
  end
209
225
 
210
226
  end
211
227
  end
212
- end
228
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graboid
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 3
9
- - 5
10
- version: 0.3.5
8
+ - 4
9
+ - 0
10
+ version: 0.4.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-09 00:00:00 -07:00
18
+ date: 2011-02-22 00:00:00 -08:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -48,20 +48,6 @@ dependencies:
48
48
  version: "0"
49
49
  type: :runtime
50
50
  version_requirements: *id002
51
- - !ruby/object:Gem::Dependency
52
- name: activesupport
53
- prerelease: false
54
- requirement: &id003 !ruby/object:Gem::Requirement
55
- none: false
56
- requirements:
57
- - - ">="
58
- - !ruby/object:Gem::Version
59
- hash: 3
60
- segments:
61
- - 0
62
- version: "0"
63
- type: :runtime
64
- version_requirements: *id003
65
51
  description: web scraping made easier
66
52
  email: signalstatic@gmail.com
67
53
  executables: []
@@ -73,7 +59,6 @@ extra_rdoc_files:
73
59
  - README.mdown
74
60
  files:
75
61
  - .document
76
- - .gitignore
77
62
  - LICENSE
78
63
  - README.mdown
79
64
  - Rakefile
@@ -94,15 +79,13 @@ files:
94
79
  - spec/graboid_spec.rb
95
80
  - spec/spec.opts
96
81
  - spec/spec_helper.rb
97
- - examples/reddit_post.rb
98
- - examples/tumblr_post.rb
99
82
  has_rdoc: true
100
83
  homepage: http://github.com/twoism/graboid
101
84
  licenses: []
102
85
 
103
86
  post_install_message:
104
- rdoc_options:
105
- - --charset=UTF-8
87
+ rdoc_options: []
88
+
106
89
  require_paths:
107
90
  - lib
108
91
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -131,13 +114,11 @@ signing_key:
131
114
  specification_version: 3
132
115
  summary: web scraping made easy
133
116
  test_files:
117
+ - examples/active_rain_post.rb
118
+ - examples/live_journal_post.rb
119
+ - examples/ning_post.rb
134
120
  - spec/fixtures/server.rb
135
121
  - spec/graboid/entity_spec.rb
136
122
  - spec/graboid/scraper_spec.rb
137
123
  - spec/graboid_spec.rb
138
124
  - spec/spec_helper.rb
139
- - examples/active_rain_post.rb
140
- - examples/live_journal_post.rb
141
- - examples/ning_post.rb
142
- - examples/reddit_post.rb
143
- - examples/tumblr_post.rb
data/.gitignore DELETED
@@ -1,21 +0,0 @@
1
- ## MAC OS
2
- .DS_Store
3
-
4
- ## TEXTMATE
5
- *.tmproj
6
- tmtags
7
-
8
- ## EMACS
9
- *~
10
- \#*
11
- .\#*
12
-
13
- ## VIM
14
- *.swp
15
-
16
- ## PROJECT::GENERAL
17
- coverage
18
- rdoc
19
- pkg
20
-
21
- ## PROJECT::SPECIFIC
@@ -1,35 +0,0 @@
1
- dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
2
- require File.join(dir, 'graboid')
3
-
4
- class RedditEntry
5
- include Graboid::Scraper
6
-
7
- selector '.entry'
8
-
9
- set :title
10
- set :domain, :selector => '.domain a'
11
-
12
- set :link, :selector => '.title' do |entry|
13
- entry.css('a').first['href']
14
- end
15
-
16
- page_with do |doc|
17
- self.doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
18
- end
19
-
20
- before_paginate do
21
- puts "opening page: #{self.source}"
22
- puts "collection size: #{self.collection.length}"
23
- puts "#{"*"*100}"
24
- end
25
-
26
- end
27
-
28
- @posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 3 )
29
-
30
- @posts.each do |p|
31
- puts "title: #{p.title}"
32
- puts "domain: #{p.domain}"
33
- puts "link: #{p.link}"
34
- puts "#{"*"*100}"
35
- end
@@ -1,33 +0,0 @@
1
- dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
2
- require File.join(dir, 'graboid')
3
-
4
- class TumblrEntry
5
- include Graboid::Scraper
6
- TUMBLR_CHUNK_SIZE = 20
7
-
8
- selector 'post'
9
-
10
- set :title, :selector => 'regular-title'
11
-
12
- page_with do |doc|
13
- next_tumblr_page
14
- end
15
-
16
- def next_tumblr_page
17
- return nil if self.doc.css('post').empty?
18
- "#{self.original_source}?start=#{self.current_page*TUMBLR_CHUNK_SIZE}"
19
- end
20
-
21
- before_paginate do
22
- puts "opening page: #{self.source}"
23
- puts "collection size: #{self.collection.length}"
24
- puts "#{"*"*100}"
25
- end
26
-
27
- end
28
-
29
- @posts = TumblrEntry.new( :source => 'http://chrisburnett.tumblr.com/api/read' ).all
30
-
31
- @posts.each do |p|
32
- puts "title: #{p.title}"
33
- end