market_bot 0.9.1 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9688ff41be35247c4bb0f85621bb800232111936
4
+ data.tar.gz: 368e41730331ab5f6084ce49d6a51cbe87904d51
5
+ SHA512:
6
+ metadata.gz: 34de9be2e58bfeab53156bd5b3248fcd412918616431ab03b3521b5065f55d0b07deb913f7f08a07cf512b5151dc1c8859a5e52f70c1e1c84cd86a9129d57fd3
7
+ data.tar.gz: 96e62fed37bee0a7ee35725a45470a3f912e7651e3868ae102f6135a7d6666e39abab95a76465e5849dafd86384745984bdebfb87949d792fbd50d11a393ef6a
data/Gemfile CHANGED
@@ -1,12 +1,12 @@
1
1
  source 'http://rubygems.org'
2
2
 
3
- gem 'typhoeus'
3
+ gem 'typhoeus', '~> 0.4.2'
4
4
  gem 'nokogiri'
5
5
 
6
6
  group :development do
7
7
  gem 'rspec', '~> 2.8.0'
8
- gem 'bundler', '~> 1.1.0'
9
- gem 'jeweler', '~> 1.6.4'
8
+ gem 'bundler', '~> 1.3.0'
9
+ gem 'jeweler', '~> 1.8.0'
10
10
  gem 'simplecov', '>= 0'
11
11
  gem 'rdoc', '>= 3.9.4'
12
12
  gem 'guard', '~> 1.2.3'
data/Gemfile.lock CHANGED
@@ -1,42 +1,72 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
+ addressable (2.3.5)
5
+ builder (3.2.2)
4
6
  columnize (0.3.6)
5
- debugger (1.1.4)
7
+ debugger (1.6.1)
6
8
  columnize (>= 0.3.1)
7
- debugger-linecache (~> 1.1.1)
8
- debugger-ruby_core_source (~> 1.1.3)
9
- debugger-linecache (1.1.1)
10
- debugger-ruby_core_source (>= 1.1.1)
11
- debugger-ruby_core_source (1.1.3)
9
+ debugger-linecache (~> 1.2.0)
10
+ debugger-ruby_core_source (~> 1.2.3)
11
+ debugger-linecache (1.2.0)
12
+ debugger-ruby_core_source (1.2.3)
12
13
  diff-lcs (1.1.3)
13
- ffi (1.0.11)
14
+ faraday (0.8.7)
15
+ multipart-post (~> 1.1)
16
+ ffi (1.9.0)
14
17
  git (1.2.5)
18
+ github_api (0.10.1)
19
+ addressable
20
+ faraday (~> 0.8.1)
21
+ hashie (>= 1.2)
22
+ multi_json (~> 1.4)
23
+ nokogiri (~> 1.5.2)
24
+ oauth2
15
25
  growl (1.0.3)
16
26
  guard (1.2.3)
17
27
  listen (>= 0.4.2)
18
28
  thor (>= 0.14.6)
19
29
  guard-rspec (1.1.0)
20
30
  guard (>= 1.1)
21
- jeweler (1.6.4)
31
+ hashie (2.0.5)
32
+ highline (1.6.19)
33
+ httpauth (0.2.0)
34
+ jeweler (1.8.6)
35
+ builder
22
36
  bundler (~> 1.0)
23
37
  git (>= 1.2.5)
38
+ github_api (= 0.10.1)
39
+ highline (>= 1.6.15)
40
+ nokogiri (= 1.5.10)
24
41
  rake
25
- json (1.7.3)
26
- listen (0.4.7)
27
- rb-fchange (~> 0.0.5)
28
- rb-fsevent (~> 0.9.1)
29
- rb-inotify (~> 0.8.8)
30
- mime-types (1.18)
31
- multi_json (1.3.6)
32
- nokogiri (1.5.4)
33
- rake (0.9.2.2)
34
- rb-fchange (0.0.5)
35
- ffi
36
- rb-fsevent (0.9.1)
37
- rb-inotify (0.8.8)
42
+ rdoc
43
+ json (1.8.0)
44
+ jwt (0.1.8)
45
+ multi_json (>= 1.5)
46
+ listen (1.2.2)
47
+ rb-fsevent (>= 0.9.3)
48
+ rb-inotify (>= 0.9)
49
+ rb-kqueue (>= 0.2)
50
+ mime-types (1.23)
51
+ multi_json (1.7.7)
52
+ multi_xml (0.5.4)
53
+ multipart-post (1.2.0)
54
+ nokogiri (1.5.10)
55
+ oauth2 (0.9.2)
56
+ faraday (~> 0.8)
57
+ httpauth (~> 0.2)
58
+ jwt (~> 0.1.4)
59
+ multi_json (~> 1.0)
60
+ multi_xml (~> 0.5)
61
+ rack (~> 1.2)
62
+ rack (1.5.2)
63
+ rake (10.1.0)
64
+ rb-fsevent (0.9.3)
65
+ rb-inotify (0.9.0)
66
+ ffi (>= 0.5.0)
67
+ rb-kqueue (0.2.0)
38
68
  ffi (>= 0.5.0)
39
- rdoc (3.12)
69
+ rdoc (4.0.1)
40
70
  json (~> 1.4)
41
71
  rspec (2.8.0)
42
72
  rspec-core (~> 2.8.0)
@@ -46,11 +76,11 @@ GEM
46
76
  rspec-expectations (2.8.0)
47
77
  diff-lcs (~> 1.1.2)
48
78
  rspec-mocks (2.8.0)
49
- simplecov (0.6.4)
79
+ simplecov (0.7.1)
50
80
  multi_json (~> 1.0)
51
- simplecov-html (~> 0.5.3)
52
- simplecov-html (0.5.3)
53
- thor (0.15.4)
81
+ simplecov-html (~> 0.7.1)
82
+ simplecov-html (0.7.1)
83
+ thor (0.18.1)
54
84
  typhoeus (0.4.2)
55
85
  ffi (~> 1.0)
56
86
  mime-types (~> 1.18)
@@ -59,14 +89,14 @@ PLATFORMS
59
89
  ruby
60
90
 
61
91
  DEPENDENCIES
62
- bundler (~> 1.1.0)
92
+ bundler (~> 1.3.0)
63
93
  debugger
64
94
  growl
65
95
  guard (~> 1.2.3)
66
96
  guard-rspec (~> 1.1.0)
67
- jeweler (~> 1.6.4)
97
+ jeweler (~> 1.8.0)
68
98
  nokogiri
69
99
  rdoc (>= 3.9.4)
70
100
  rspec (~> 2.8.0)
71
101
  simplecov
72
- typhoeus
102
+ typhoeus (~> 0.4.2)
data/README.markdown CHANGED
@@ -6,6 +6,10 @@ Books, music, movies, etc aren't currently supported.
6
6
  It is built on top of Nokogiri and Typhoeus.
7
7
  Used in production to power [www.droidmeter.com](http://www.droidmeter.com/?t=github).
8
8
 
9
+ **This project is currently seeking developers to help maintain it.
10
+ Please send pull requests or contact me if you are able to help out.
11
+ The app scraper is known to work, but leader board functionality is currently broken.**
12
+
9
13
  ## Dependencies
10
14
 
11
15
  * Nokogiri
@@ -68,6 +72,11 @@ Used in production to power [www.droidmeter.com](http://www.droidmeter.com/?t=gi
68
72
  puts "First place app (#{first_app.title}) price: #{first_app.price}" unless first_app.error
69
73
  puts "Last place app (#{last_app.title}) price: #{last_app.price}" unless last_app.error
70
74
 
75
+ ## Excessive Use
76
+
77
+ Google will block your IP address if you attempt to scrape large quantities of data.
78
+ Please contact me if you are looking for commercial data solutions.
79
+
71
80
  ## Contributing to Market Bot
72
81
 
73
82
  1. Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.9.1
1
+ 0.10.0
@@ -19,132 +19,109 @@ module MarketBot
19
19
  result = {}
20
20
 
21
21
  doc = Nokogiri::HTML(html)
22
-
23
- elements = doc.css('.doc-metadata').first.elements[2].elements
24
- elem_count = elements.count
25
-
26
- (3..(elem_count - 1)).select{ |n| n.odd? }.each do |i|
27
- field_name = elements[i].text
22
+ meta_info = doc.css('.meta-info')
23
+ meta_info.each do |info|
24
+ field_name = info.css('.title').text.strip
28
25
 
29
26
  case field_name
30
- when 'Updated:'
31
- result[:updated] = elements[i + 1].text
32
- when 'Current Version:'
33
- result[:current_version] = elements[i + 1].text
34
- when 'Requires Android:'
35
- result[:requires_android] = elements[i + 1].text
36
- when 'Category:'
37
- result[:category] = elements[i + 1].text
38
- when 'Installs:'
39
- result[:installs] = elements[i + 1].children.first.text
40
- when 'Size:'
41
- result[:size] = elements[i + 1].text
42
- when 'Price:'
43
- result[:price] = elements[i + 1].text
44
- when 'Content Rating:'
45
- result[:content_rating] = elements[i + 1].text
27
+ when 'Updated'
28
+ result[:updated] = info.css('.content').text.strip
29
+ when 'Installs'
30
+ result[:installs] = info.css('.content').text.strip
31
+ when 'Size'
32
+ result[:size] = info.css('.content').text.strip
33
+ when 'Current Version'
34
+ result[:current_version] = info.css('.content').text.strip
35
+ when 'Requires Android'
36
+ result[:requires_android] = info.css('.content').text.strip
37
+ when 'Content Rating'
38
+ result[:content_rating] = info.css('.content').text.strip
39
+ when 'Contact Developer'
40
+ info.css('.dev-link').each do |node|
41
+ if node.text.strip.eql? 'Email Developer'
42
+ result[:email] = node[:href].gsub(/^mailto:/,'')
43
+ else
44
+ redirect_url = node[:href]
45
+ if q_param = URI(redirect_url).query.split('&').select{ |p| p =~ /q=/ }.first
46
+ actual_url = q_param.gsub('q=', '')
47
+ end
48
+
49
+ result[:website_url] = actual_url
50
+ end
51
+ end
52
+
46
53
  end
47
54
  end
48
55
 
49
- result[:description] = doc.css('#doc-original-text').first.inner_html
50
- result[:title] = doc.css('.doc-banner-title').text
56
+ node = doc.xpath("//meta[@itemprop='price']").first
57
+ result[:price] = node[:content].strip rescue 'Free'
51
58
 
52
- rating_elem = doc.css('.average-rating-value')
53
- result[:rating] = rating_elem.first.text unless rating_elem.empty?
59
+ result[:category] = doc.css('.category').first.text.strip rescue ''
60
+ result[:description] = doc.xpath("//div[@itemprop='description']").first.inner_html.strip
61
+ result[:title] = doc.xpath("//div[@itemprop='name']").first.text.strip
54
62
 
55
- votes_elem = doc.css('.votes')
56
- result[:votes] = doc.css('.votes').first.text unless votes_elem.empty?
63
+ score = doc.css('.score-container').first
64
+ unless score.nil?
65
+ node = score.css('.score').first
66
+ result[:rating] = node.text.strip
67
+ node = score.xpath("//meta[@itemprop='ratingCount']").first
68
+ result[:votes] = node[:content].strip
69
+ end
57
70
 
58
- result[:developer] = doc.css('.doc-banner-title-container a').text
71
+ node = doc.xpath("//div[@itemprop='author']")
72
+ result[:developer] = node.css('.primary').first.text.strip
59
73
 
60
74
  result[:more_from_developer] = []
61
75
  result[:users_also_installed] = []
62
76
  result[:related] = []
63
77
 
64
- if similar_elem = doc.css('.doc-similar').first
65
- similar_elem.children.each do |similar_elem_child|
66
- assoc_app_type = similar_elem_child.attributes['data-analyticsid'].text
67
-
68
- next unless %w(more-from-developer users-also-installed related).include?(assoc_app_type)
78
+ node = doc.css('.recommendation')
79
+ node.css('.rec-cluster').each do |recommended|
80
+ assoc_app_type = recommended.css('.heading').first.text.strip.eql?('Similar' ) ? :related : :more_from_developer
81
+ recommended.css('.card').each do |card|
82
+ assoc_app = {}
83
+ assoc_app[:app_id] = card['data-docid'].strip
69
84
 
70
- assoc_app_type = assoc_app_type.gsub('-', '_').to_sym
71
- result[assoc_app_type] ||= []
72
-
73
- similar_elem_child.css('.app-left-column-related-snippet-container').each do |app_elem|
74
- assoc_app = {}
75
-
76
- assoc_app[:app_id] = app_elem.attributes['data-docid'].text
77
-
78
- result[assoc_app_type] << assoc_app
79
- end
85
+ result[assoc_app_type] << assoc_app
80
86
  end
81
87
  end
88
+ # Users also installed is no longer on the page, adding this for backwards compatibility, well, sort of....
89
+ result[:users_also_installed] = result[:related]
82
90
 
83
- result[:banner_icon_url] = doc.css('.doc-banner-icon img').first.attributes['src'].value
84
-
85
- if image_elem = doc.css('.doc-banner-image-container img').first
86
- result[:banner_image_url] = image_elem.attributes['src'].value
87
- else
88
- result[:banner_image_url] = nil
91
+ node = doc.css('.cover-image').first
92
+ unless node.nil?
93
+ result[:banner_icon_url] = node[:src]
94
+ result[:banner_image_url] = node[:src]
89
95
  end
90
96
 
91
- if website_elem = doc.css('a').select{ |l| l.text.include?("Visit Developer's Website")}.first
92
- redirect_url = website_elem.attribute('href').value
93
-
94
- if q_param = URI(redirect_url).query.split('&').select{ |p| p =~ /q=/ }.first
95
- actual_url = q_param.gsub('q=', '')
97
+ result[:youtube_video_ids] = []
98
+ doc.css('.play-click-target').each do |node|
99
+ url = node['data-video-url']
100
+ unless url.nil?
101
+ result[:youtube_video_ids] << url.split('embed/').last.split('?').first
96
102
  end
97
-
98
- result[:website_url] = actual_url
99
- end
100
-
101
- if email_elem = doc.css('a').select{ |l| l.text.include?("Email Developer")}.first
102
- result[:email] = email_elem.attribute('href').value.gsub(/^mailto:/, '')
103
- end
104
-
105
- unless (video_section_elem = doc.css('.doc-video-section')).empty?
106
- urls = video_section_elem.children.css('embed').map{ |e| e.attribute('src').value }
107
- result[:youtube_video_ids] = urls.map{ |u| /youtube\.com\/v\/(.*)\?/.match(u)[1] }
108
- else
109
- result[:youtube_video_ids] = []
110
103
  end
111
104
 
112
- screenshots = doc.css('.screenshot-carousel-content-container img')
113
-
114
- if screenshots && screenshots.length > 0
115
- result[:screenshot_urls] = screenshots.map { |s| s.attributes['src'].value }
116
- else
117
- result[:screenshot_urls] = []
105
+ result[:screenshot_urls] = []
106
+ doc.css('.screenshot').each do |node|
107
+ result[:screenshot_urls] << node[:src]
118
108
  end
119
109
 
120
- result[:whats_new] = doc.css('.doc-whatsnew-container').inner_html
110
+ node = doc.css('.whatsnew').first
111
+ result[:whats_new] = node.inner_html.strip unless node.nil?
121
112
 
113
+ # Stubbing out for now, can't find them in the redesigned page.
122
114
  result[:permissions] = permissions = []
123
- perm_types = ['dangerous', 'safe']
124
- perm_types.each do |type|
125
- doc.css("#doc-permissions-#{type} .doc-permission-group").each do |group_elem|
126
- title = group_elem.css('.doc-permission-group-title').text
127
- group_elem.css('.doc-permission-description').each do |desc_elem|
128
- #permissions << { :security => type, :group => title, :description => desc_elem.text }
129
- end
130
- descriptions = group_elem.css('.doc-permission-description').map { |e| e.text }
131
- descriptions_full = group_elem.css('.doc-permission-description-full').map { |e| e.text }
132
- (0...descriptions.length).each do |i|
133
- desc = descriptions[i]
134
- desc_full = descriptions_full[i]
135
- permissions << { :security => type, :group => title, :description => desc, :description_full => desc_full }
136
- end
137
- end
138
- end
139
115
 
140
116
  result[:rating_distribution] = { 5 => nil, 4 => nil, 3 => nil, 2 => nil, 1 => nil }
141
117
 
142
- if (histogram = doc.css('div.histogram-table').first)
143
- cur_index = 5
144
- histogram.css('tr').each do |e|
145
- result[:rating_distribution][cur_index] = e.children.last.inner_text.gsub(/[^0-9]/, '').to_i
146
- cur_index -= 1
147
- end
118
+ histogram = doc.css('div.rating-histogram')
119
+ cur_index = 5
120
+ %w(five four three two one).each do |slot|
121
+ node = histogram.css(".#{slot.to_s}")
122
+ result[:rating_distribution][cur_index] = node.css('.bar-number').text.to_i
123
+ cur_index -= 1
124
+
148
125
  end
149
126
 
150
127
  result[:html] = html
@@ -30,7 +30,11 @@ module MarketBot
30
30
  end
31
31
 
32
32
  result[:title] = details_node.css('.title').first.attributes['title'].to_s
33
- result[:price_usd] = details_node.css('.buy-button-price').children.first.text.gsub(' Buy', '')
33
+
34
+ if (price_elem = details_node.css('.buy-button-price').children.first)
35
+ result[:price_usd] = price_elem.text.gsub(' Buy', '')
36
+ end
37
+
34
38
  result[:developer] = details_node.css('.attribution').children.first.text
35
39
  result[:market_id] = details_node.css('.title').first.attributes['href'].to_s.gsub('/store/apps/details?id=', '').gsub(/&feature=.*$/, '')
36
40
  result[:market_url] = "https://play.google.com/store/apps/details?id=#{result[:market_id]}&hl=en"
@@ -69,6 +73,7 @@ module MarketBot
69
73
  @hydra = options[:hydra] || MarketBot.hydra
70
74
  @request_opts = options[:request_opts] || {}
71
75
  @parsed_results = []
76
+ @pending_pages = []
72
77
  end
73
78
 
74
79
  def market_urls(options={})
@@ -92,7 +97,8 @@ module MarketBot
92
97
  results
93
98
  end
94
99
 
95
- def enqueue_update(options={})
100
+ def enqueue_update(options={},&block)
101
+ @callback = block
96
102
  if @identifier.to_s.downcase == 'editors_choice' && category == nil
97
103
  url = 'https://play.google.com/store/apps/collection/editors_choice?&hl=en'
98
104
  process_page(url, 1)
@@ -130,6 +136,7 @@ module MarketBot
130
136
 
131
137
  private
132
138
  def process_page(url, page_num)
139
+ @pending_pages << page_num
133
140
  request = Typhoeus::Request.new(url, @request_opts)
134
141
  request.on_complete do |response|
135
142
  # HACK: Typhoeus <= 0.4.2 returns a response, 0.5.0pre returns the request.
@@ -143,6 +150,8 @@ module MarketBot
143
150
 
144
151
  def update_callback(result, page)
145
152
  @parsed_results[page] = result
153
+ @pending_pages.delete(page)
154
+ @callback.call(self) if @callback and @pending_pages.empty?
146
155
  end
147
156
  end
148
157
 
@@ -4,7 +4,7 @@ module MarketBot
4
4
  # Search query pages are extremely similar to leaderboard pages.
5
5
  # Amazingly, this inheritence hack works!
6
6
  class SearchQuery < MarketBot::Android::Leaderboard
7
- def initialze(query, options={})
7
+ def initialize(query, options={})
8
8
  super(query, nil, options)
9
9
  end
10
10
 
data/market_bot.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "market_bot"
8
- s.version = "0.9.1"
8
+ s.version = "0.10.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Chad Remesch"]
12
- s.date = "2012-10-10"
12
+ s.date = "2013-07-19"
13
13
  s.description = "Market Bot is a high performance Ruby scraper for Google's Android Market with a simple to use API. It is built on top of Nokogiri and Typhoeus."
14
14
  s.email = "chad@remesch.com"
15
15
  s.extra_rdoc_files = [
@@ -51,18 +51,18 @@ Gem::Specification.new do |s|
51
51
  s.homepage = "http://github.com/chadrem/market_bot"
52
52
  s.licenses = ["MIT"]
53
53
  s.require_paths = ["lib"]
54
- s.rubygems_version = "1.8.24"
54
+ s.rubygems_version = "2.0.3"
55
55
  s.summary = "Market Bot: High performance Ruby scraper for Google's Android Market"
56
56
 
57
57
  if s.respond_to? :specification_version then
58
- s.specification_version = 3
58
+ s.specification_version = 4
59
59
 
60
60
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
61
- s.add_runtime_dependency(%q<typhoeus>, [">= 0"])
61
+ s.add_runtime_dependency(%q<typhoeus>, ["~> 0.4.2"])
62
62
  s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
63
63
  s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
64
- s.add_development_dependency(%q<bundler>, ["~> 1.1.0"])
65
- s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
64
+ s.add_development_dependency(%q<bundler>, ["~> 1.3.0"])
65
+ s.add_development_dependency(%q<jeweler>, ["~> 1.8.0"])
66
66
  s.add_development_dependency(%q<simplecov>, [">= 0"])
67
67
  s.add_development_dependency(%q<rdoc>, [">= 3.9.4"])
68
68
  s.add_development_dependency(%q<guard>, ["~> 1.2.3"])
@@ -70,11 +70,11 @@ Gem::Specification.new do |s|
70
70
  s.add_development_dependency(%q<growl>, [">= 0"])
71
71
  s.add_development_dependency(%q<debugger>, [">= 0"])
72
72
  else
73
- s.add_dependency(%q<typhoeus>, [">= 0"])
73
+ s.add_dependency(%q<typhoeus>, ["~> 0.4.2"])
74
74
  s.add_dependency(%q<nokogiri>, [">= 0"])
75
75
  s.add_dependency(%q<rspec>, ["~> 2.8.0"])
76
- s.add_dependency(%q<bundler>, ["~> 1.1.0"])
77
- s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
76
+ s.add_dependency(%q<bundler>, ["~> 1.3.0"])
77
+ s.add_dependency(%q<jeweler>, ["~> 1.8.0"])
78
78
  s.add_dependency(%q<simplecov>, [">= 0"])
79
79
  s.add_dependency(%q<rdoc>, [">= 3.9.4"])
80
80
  s.add_dependency(%q<guard>, ["~> 1.2.3"])
@@ -83,11 +83,11 @@ Gem::Specification.new do |s|
83
83
  s.add_dependency(%q<debugger>, [">= 0"])
84
84
  end
85
85
  else
86
- s.add_dependency(%q<typhoeus>, [">= 0"])
86
+ s.add_dependency(%q<typhoeus>, ["~> 0.4.2"])
87
87
  s.add_dependency(%q<nokogiri>, [">= 0"])
88
88
  s.add_dependency(%q<rspec>, ["~> 2.8.0"])
89
- s.add_dependency(%q<bundler>, ["~> 1.1.0"])
90
- s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
89
+ s.add_dependency(%q<bundler>, ["~> 1.3.0"])
90
+ s.add_dependency(%q<jeweler>, ["~> 1.8.0"])
91
91
  s.add_dependency(%q<simplecov>, [">= 0"])
92
92
  s.add_dependency(%q<rdoc>, [">= 3.9.4"])
93
93
  s.add_dependency(%q<guard>, ["~> 1.2.3"])