graboid 0.3.5 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -12,7 +12,6 @@ begin
12
12
  gem.authors = ["Christopher Burnett"]
13
13
  gem.add_development_dependency "rspec", ">= 1.2.9"
14
14
  gem.add_dependency "nokogiri"
15
- gem.add_dependency "activesupport"
16
15
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
16
  end
18
17
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.5
1
+ 0.4.0
@@ -1,61 +1,57 @@
1
1
  # Generated by jeweler
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{graboid}
8
- s.version = "0.3.5"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Christopher Burnett"]
12
- s.date = %q{2010-07-09}
12
+ s.date = %q{2011-02-22}
13
13
  s.description = %q{web scraping made easier}
14
14
  s.email = %q{signalstatic@gmail.com}
15
15
  s.extra_rdoc_files = [
16
16
  "LICENSE",
17
- "README.mdown"
17
+ "README.mdown"
18
18
  ]
19
19
  s.files = [
20
20
  ".document",
21
- ".gitignore",
22
- "LICENSE",
23
- "README.mdown",
24
- "Rakefile",
25
- "VERSION",
26
- "examples/active_rain_post.rb",
27
- "examples/live_journal_post.rb",
28
- "examples/ning_post.rb",
29
- "graboid.gemspec",
30
- "lib/graboid.rb",
31
- "lib/graboid/entity.rb",
32
- "lib/graboid/scraper.rb",
33
- "spec/fixtures/graboid.jpg",
34
- "spec/fixtures/posts.html",
35
- "spec/fixtures/server.rb",
36
- "spec/fixtures/views/posts.erb",
37
- "spec/graboid/entity_spec.rb",
38
- "spec/graboid/scraper_spec.rb",
39
- "spec/graboid_spec.rb",
40
- "spec/spec.opts",
41
- "spec/spec_helper.rb"
21
+ "LICENSE",
22
+ "README.mdown",
23
+ "Rakefile",
24
+ "VERSION",
25
+ "examples/active_rain_post.rb",
26
+ "examples/live_journal_post.rb",
27
+ "examples/ning_post.rb",
28
+ "graboid.gemspec",
29
+ "lib/graboid.rb",
30
+ "lib/graboid/entity.rb",
31
+ "lib/graboid/scraper.rb",
32
+ "spec/fixtures/graboid.jpg",
33
+ "spec/fixtures/posts.html",
34
+ "spec/fixtures/server.rb",
35
+ "spec/fixtures/views/posts.erb",
36
+ "spec/graboid/entity_spec.rb",
37
+ "spec/graboid/scraper_spec.rb",
38
+ "spec/graboid_spec.rb",
39
+ "spec/spec.opts",
40
+ "spec/spec_helper.rb"
42
41
  ]
43
42
  s.homepage = %q{http://github.com/twoism/graboid}
44
- s.rdoc_options = ["--charset=UTF-8"]
45
43
  s.require_paths = ["lib"]
46
44
  s.rubygems_version = %q{1.3.7}
47
45
  s.summary = %q{web scraping made easy}
48
46
  s.test_files = [
47
+ "examples/active_rain_post.rb",
48
+ "examples/live_journal_post.rb",
49
+ "examples/ning_post.rb",
49
50
  "spec/fixtures/server.rb",
50
- "spec/graboid/entity_spec.rb",
51
- "spec/graboid/scraper_spec.rb",
52
- "spec/graboid_spec.rb",
53
- "spec/spec_helper.rb",
54
- "examples/active_rain_post.rb",
55
- "examples/live_journal_post.rb",
56
- "examples/ning_post.rb",
57
- "examples/reddit_post.rb",
58
- "examples/tumblr_post.rb"
51
+ "spec/graboid/entity_spec.rb",
52
+ "spec/graboid/scraper_spec.rb",
53
+ "spec/graboid_spec.rb",
54
+ "spec/spec_helper.rb"
59
55
  ]
60
56
 
61
57
  if s.respond_to? :specification_version then
@@ -65,16 +61,13 @@ Gem::Specification.new do |s|
65
61
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
66
62
  s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
67
63
  s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
68
- s.add_runtime_dependency(%q<activesupport>, [">= 0"])
69
64
  else
70
65
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
71
66
  s.add_dependency(%q<nokogiri>, [">= 0"])
72
- s.add_dependency(%q<activesupport>, [">= 0"])
73
67
  end
74
68
  else
75
69
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
76
70
  s.add_dependency(%q<nokogiri>, [">= 0"])
77
- s.add_dependency(%q<activesupport>, [">= 0"])
78
71
  end
79
72
  end
80
73
 
@@ -1,10 +1,20 @@
1
- %w{rubygems nokogiri open-uri active_support ostruct}.each { |f| require f }
1
+ %w{rubygems nokogiri open-uri ostruct}.each { |f| require f }
2
2
 
3
3
  dir = Pathname(__FILE__).dirname.expand_path
4
4
 
5
5
  require dir + 'graboid/entity'
6
6
  require dir + 'graboid/scraper'
7
7
 
8
+ class String
9
+ def underscore
10
+ self.to_s.gsub(/::/, '/').
11
+ gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
12
+ gsub(/([a-z\d])([A-Z])/,'\1_\2').
13
+ tr("-", "_").
14
+ downcase
15
+ end
16
+ end
17
+
8
18
  module Graboid
9
19
  extend self
10
20
 
@@ -15,4 +25,4 @@ module Graboid
15
25
  def user_agent=(agent)
16
26
  @user_agent = agent
17
27
  end
18
- end
28
+ end
@@ -1,68 +1,87 @@
1
1
  module Graboid
2
2
  module Entity
3
-
3
+
4
4
  def self.included klass
5
5
  klass.class_eval do
6
6
  extend ClassMethods
7
7
  include InstanceMethods
8
- warn "Deprecation Warning! Graboid::Entity - This module has been deprecated. See Graboid::Scraper."
9
- write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
8
+
9
+ inherited_attributes :attribute_map
10
+ @attribute_map = {}
10
11
  end
11
12
  end
12
-
13
+
13
14
  module ClassMethods
14
-
15
+
16
+ def inherited_attributes(*args)
17
+ @inherited_attributes ||= [:inherited_attributes]
18
+ @inherited_attributes += args
19
+ args.each do |arg|
20
+ class_eval %(
21
+ class << self; attr_accessor :#{arg} end
22
+ )
23
+ end
24
+ @inherited_attributes
25
+ end
26
+
27
+ def inherited(subclass)
28
+ @inherited_attributes.each do |inheritable_attribute|
29
+ instance_var = "@#{inheritable_attribute}"
30
+ subclass.instance_variable_set(instance_var, instance_variable_get(instance_var))
31
+ end
32
+ end
33
+
15
34
  def source
16
35
  @source
17
36
  end
18
-
37
+
19
38
  def source=(src)
20
39
  @source = src
21
40
  end
22
-
41
+
23
42
  def set name, opts={}, &block
24
- opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
43
+ opts.merge!(:selector => ".#{name}") if opts[:selector].nil?
25
44
  opts.merge!(:processor => block) if block_given?
26
-
45
+
27
46
  attribute_map[name] = opts
28
47
  end
29
-
48
+
30
49
  alias_method :field, :set
31
-
50
+
32
51
  def selector selector
33
52
  @root_selector = selector
34
53
  end
35
-
54
+
36
55
  alias_method :root, :selector
37
56
 
38
57
  def root_selector
39
58
  @root_selector || inferred_selector
40
59
  end
41
-
60
+
42
61
  def inferred_selector
43
62
  @inferred_selector ||= ".#{self.to_s.underscore}"
44
63
  end
45
-
64
+
46
65
  def doc
47
66
  eval "Nokogiri::#{self.mode.to_s.upcase}(read_source)"
48
67
  end
49
-
68
+
50
69
  def collection
51
70
  @collection ||= []
52
71
  end
53
-
72
+
54
73
  def collection=(col)
55
74
  @collection = col
56
75
  end
57
-
76
+
58
77
  def attribute_map
59
78
  read_inheritable_attribute :attribute_map
60
79
  end
61
-
80
+
62
81
  def extract_instance fragment
63
82
  new(hash_map(fragment))
64
83
  end
65
-
84
+
66
85
  def hash_map fragment
67
86
  attribute_map.inject({}) do |extracted_hash, at|
68
87
  selector, processor = at.last[:selector], at.last[:processor]
@@ -72,7 +91,7 @@ module Graboid
72
91
  extracted_hash
73
92
  end
74
93
  end
75
-
94
+
76
95
  def all_fragments
77
96
  return page_fragments if @pager.nil?
78
97
  old_source = self.source
@@ -85,13 +104,13 @@ module Graboid
85
104
  self.source = old_source
86
105
  self.collection
87
106
  end
88
-
107
+
89
108
  def paginate
90
109
  next_page_url = @pager.call(doc) rescue nil
91
110
  self.source = next_page_url
92
111
  self.current_page += 1
93
112
  end
94
-
113
+
95
114
  def next_page?
96
115
  if max_pages.zero?
97
116
  return true unless @pager.call(doc).nil?
@@ -99,23 +118,23 @@ module Graboid
99
118
  current_page <= max_pages-1
100
119
  end
101
120
  end
102
-
121
+
103
122
  def page_fragments
104
123
  doc.css(root_selector)
105
124
  end
106
-
125
+
107
126
  def all opts={}
108
127
  reset_context
109
- self.max_pages = opts[:max_pages] if opts[:max_pages].present?
128
+ self.max_pages = opts[:max_pages] unless opts[:max_pages].nil?
110
129
  all_fragments.collect{ |frag| extract_instance(frag) }
111
130
  end
112
-
131
+
113
132
  def reset_context
114
133
  self.collection = []
115
134
  self.current_page = 0
116
135
  self.max_pages = 0
117
136
  end
118
-
137
+
119
138
  def read_source
120
139
  case self.source
121
140
  when /^http[s]?:\/\//
@@ -124,36 +143,36 @@ module Graboid
124
143
  self.source
125
144
  end
126
145
  end
127
-
146
+
128
147
  def pager &block
129
148
  @pager = block
130
149
  end
131
-
150
+
132
151
  def mode
133
152
  @mode ||= :html
134
153
  end
135
-
154
+
136
155
  def mode=(m)
137
156
  raise ArgumentError unless [:html, :xml].include?(m)
138
157
  @mode = m
139
158
  end
140
-
159
+
141
160
  def max_pages
142
161
  @max_pages ||= 0
143
162
  end
144
-
163
+
145
164
  def max_pages=num
146
165
  @max_pages = num
147
166
  end
148
-
167
+
149
168
  def current_page
150
169
  @current_page ||= 0
151
170
  end
152
-
171
+
153
172
  def current_page=num
154
173
  @current_page = num
155
174
  end
156
-
175
+
157
176
  instance_eval do
158
177
  [:before, :after].each do |prefix|
159
178
  [:paginate, :extract].each do |suffix|
@@ -163,29 +182,29 @@ module Graboid
163
182
  end
164
183
  define_method "run_#{method_name}_callbacks" do
165
184
  ivar = instance_variable_get("@#{method_name}")
166
- self.class_eval { ivar.call } unless ivar.nil?
185
+ class_eval { ivar.call } unless ivar.nil?
167
186
  end
168
187
  end
169
- end
170
- end
171
-
188
+ end
189
+ end
190
+
172
191
  end # ClassMethods
173
-
192
+
174
193
  module InstanceMethods
175
-
194
+
176
195
  def initialize opts={}
177
196
  opts.each do |k,v|
178
- self.class_eval do
197
+ self.class.class_eval do
179
198
  define_method k do
180
199
  v
181
200
  end
182
201
  end
183
202
  end
184
203
  end
185
-
204
+
186
205
  def attribute_map
187
206
  self.class.attribute_map
188
207
  end
189
208
  end # InstanceMethods
190
209
  end
191
- end
210
+ end
@@ -1,19 +1,35 @@
1
1
  module Graboid
2
+
2
3
  module Scraper
3
4
  def self.included klass
4
5
  klass.class_eval do
5
6
  extend ClassMethods
6
7
  include InstanceMethods
7
-
8
- write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
9
- write_inheritable_attribute(:callbacks, {}) if callbacks.nil?
8
+
9
+ inherited_attributes :attribute_map, :callbacks
10
+ @attribute_map = {}
11
+ @callbacks = {}
10
12
  end
11
13
  end
12
-
14
+
13
15
  module ClassMethods
14
-
15
- def attribute_map
16
- read_inheritable_attribute :attribute_map
16
+
17
+ def inherited_attributes(*args)
18
+ @inherited_attributes ||= [:inherited_attributes]
19
+ @inherited_attributes += args
20
+ args.each do |arg|
21
+ class_eval %(
22
+ class << self; attr_accessor :#{arg} end
23
+ )
24
+ end
25
+ @inherited_attributes
26
+ end
27
+
28
+ def inherited(subclass)
29
+ @inherited_attributes.each do |inheritable_attribute|
30
+ instance_var = "@#{inheritable_attribute}"
31
+ subclass.instance_variable_set(instance_var, instance_variable_get(instance_var))
32
+ end
17
33
  end
18
34
 
19
35
  def callbacks
@@ -41,7 +57,7 @@ module Graboid
41
57
  alias_method :root, :selector
42
58
 
43
59
  def set name, opts={}, &block
44
- opts.merge!(:selector => ".#{name}") unless opts[:selector].present?
60
+ opts.merge!(:selector => ".#{name}") if opts[:selector].nil?
45
61
  opts.merge!(:processor => block) if block_given?
46
62
 
47
63
  attribute_map[name] = opts
@@ -60,14 +76,14 @@ module Graboid
60
76
 
61
77
  module InstanceMethods
62
78
  def initialize opts={}, &block
63
- raise ArgumentError unless opts[:source].present?
79
+ raise ArgumentError if opts[:source].nil?
64
80
  self.source = opts[:source]
65
81
  end
66
82
 
67
83
  def all opts={}, reload=false
68
84
  return self.collection if reload and !self.collection.empty?
69
85
  reset_context
70
- self.max_pages = opts[:max_pages] if opts[:max_pages].present?
86
+ self.max_pages = opts[:max_pages] unless opts[:max_pages].nil?
71
87
  all_fragments.collect{ |frag| extract_instance(frag) }
72
88
  end
73
89
 
@@ -202,11 +218,11 @@ module Graboid
202
218
  [:paginate, :extract].each do |suffix|
203
219
  method_name = "#{prefix}_#{suffix}"
204
220
  define_method "run_#{method_name}_callbacks" do
205
- self.instance_eval &callbacks[method_name.to_sym] if callbacks[method_name.to_sym].present?
221
+ self.instance_eval &callbacks[method_name.to_sym] unless callbacks[method_name.to_sym].nil?
206
222
  end
207
223
  end
208
224
  end
209
225
 
210
226
  end
211
227
  end
212
- end
228
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: graboid
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 3
9
- - 5
10
- version: 0.3.5
8
+ - 4
9
+ - 0
10
+ version: 0.4.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Christopher Burnett
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-09 00:00:00 -07:00
18
+ date: 2011-02-22 00:00:00 -08:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -48,20 +48,6 @@ dependencies:
48
48
  version: "0"
49
49
  type: :runtime
50
50
  version_requirements: *id002
51
- - !ruby/object:Gem::Dependency
52
- name: activesupport
53
- prerelease: false
54
- requirement: &id003 !ruby/object:Gem::Requirement
55
- none: false
56
- requirements:
57
- - - ">="
58
- - !ruby/object:Gem::Version
59
- hash: 3
60
- segments:
61
- - 0
62
- version: "0"
63
- type: :runtime
64
- version_requirements: *id003
65
51
  description: web scraping made easier
66
52
  email: signalstatic@gmail.com
67
53
  executables: []
@@ -73,7 +59,6 @@ extra_rdoc_files:
73
59
  - README.mdown
74
60
  files:
75
61
  - .document
76
- - .gitignore
77
62
  - LICENSE
78
63
  - README.mdown
79
64
  - Rakefile
@@ -94,15 +79,13 @@ files:
94
79
  - spec/graboid_spec.rb
95
80
  - spec/spec.opts
96
81
  - spec/spec_helper.rb
97
- - examples/reddit_post.rb
98
- - examples/tumblr_post.rb
99
82
  has_rdoc: true
100
83
  homepage: http://github.com/twoism/graboid
101
84
  licenses: []
102
85
 
103
86
  post_install_message:
104
- rdoc_options:
105
- - --charset=UTF-8
87
+ rdoc_options: []
88
+
106
89
  require_paths:
107
90
  - lib
108
91
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -131,13 +114,11 @@ signing_key:
131
114
  specification_version: 3
132
115
  summary: web scraping made easy
133
116
  test_files:
117
+ - examples/active_rain_post.rb
118
+ - examples/live_journal_post.rb
119
+ - examples/ning_post.rb
134
120
  - spec/fixtures/server.rb
135
121
  - spec/graboid/entity_spec.rb
136
122
  - spec/graboid/scraper_spec.rb
137
123
  - spec/graboid_spec.rb
138
124
  - spec/spec_helper.rb
139
- - examples/active_rain_post.rb
140
- - examples/live_journal_post.rb
141
- - examples/ning_post.rb
142
- - examples/reddit_post.rb
143
- - examples/tumblr_post.rb
data/.gitignore DELETED
@@ -1,21 +0,0 @@
1
- ## MAC OS
2
- .DS_Store
3
-
4
- ## TEXTMATE
5
- *.tmproj
6
- tmtags
7
-
8
- ## EMACS
9
- *~
10
- \#*
11
- .\#*
12
-
13
- ## VIM
14
- *.swp
15
-
16
- ## PROJECT::GENERAL
17
- coverage
18
- rdoc
19
- pkg
20
-
21
- ## PROJECT::SPECIFIC
@@ -1,35 +0,0 @@
1
- dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
2
- require File.join(dir, 'graboid')
3
-
4
- class RedditEntry
5
- include Graboid::Scraper
6
-
7
- selector '.entry'
8
-
9
- set :title
10
- set :domain, :selector => '.domain a'
11
-
12
- set :link, :selector => '.title' do |entry|
13
- entry.css('a').first['href']
14
- end
15
-
16
- page_with do |doc|
17
- self.doc.css('p.nextprev a').select{|a| a.text =~ /next/i }.first['href']
18
- end
19
-
20
- before_paginate do
21
- puts "opening page: #{self.source}"
22
- puts "collection size: #{self.collection.length}"
23
- puts "#{"*"*100}"
24
- end
25
-
26
- end
27
-
28
- @posts = RedditEntry.new( :source => 'http://reddit.com' ).all( :max_pages => 3 )
29
-
30
- @posts.each do |p|
31
- puts "title: #{p.title}"
32
- puts "domain: #{p.domain}"
33
- puts "link: #{p.link}"
34
- puts "#{"*"*100}"
35
- end
@@ -1,33 +0,0 @@
1
- dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
2
- require File.join(dir, 'graboid')
3
-
4
- class TumblrEntry
5
- include Graboid::Scraper
6
- TUMBLR_CHUNK_SIZE = 20
7
-
8
- selector 'post'
9
-
10
- set :title, :selector => 'regular-title'
11
-
12
- page_with do |doc|
13
- next_tumblr_page
14
- end
15
-
16
- def next_tumblr_page
17
- return nil if self.doc.css('post').empty?
18
- "#{self.original_source}?start=#{self.current_page*TUMBLR_CHUNK_SIZE}"
19
- end
20
-
21
- before_paginate do
22
- puts "opening page: #{self.source}"
23
- puts "collection size: #{self.collection.length}"
24
- puts "#{"*"*100}"
25
- end
26
-
27
- end
28
-
29
- @posts = TumblrEntry.new( :source => 'http://chrisburnett.tumblr.com/api/read' ).all
30
-
31
- @posts.each do |p|
32
- puts "title: #{p.title}"
33
- end