market_bot 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9688ff41be35247c4bb0f85621bb800232111936
4
+ data.tar.gz: 368e41730331ab5f6084ce49d6a51cbe87904d51
5
+ SHA512:
6
+ metadata.gz: 34de9be2e58bfeab53156bd5b3248fcd412918616431ab03b3521b5065f55d0b07deb913f7f08a07cf512b5151dc1c8859a5e52f70c1e1c84cd86a9129d57fd3
7
+ data.tar.gz: 96e62fed37bee0a7ee35725a45470a3f912e7651e3868ae102f6135a7d6666e39abab95a76465e5849dafd86384745984bdebfb87949d792fbd50d11a393ef6a
data/Gemfile CHANGED
@@ -1,12 +1,12 @@
1
1
  source 'http://rubygems.org'
2
2
 
3
- gem 'typhoeus'
3
+ gem 'typhoeus', '~> 0.4.2'
4
4
  gem 'nokogiri'
5
5
 
6
6
  group :development do
7
7
  gem 'rspec', '~> 2.8.0'
8
- gem 'bundler', '~> 1.1.0'
9
- gem 'jeweler', '~> 1.6.4'
8
+ gem 'bundler', '~> 1.3.0'
9
+ gem 'jeweler', '~> 1.8.0'
10
10
  gem 'simplecov', '>= 0'
11
11
  gem 'rdoc', '>= 3.9.4'
12
12
  gem 'guard', '~> 1.2.3'
data/Gemfile.lock CHANGED
@@ -1,42 +1,72 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
+ addressable (2.3.5)
5
+ builder (3.2.2)
4
6
  columnize (0.3.6)
5
- debugger (1.1.4)
7
+ debugger (1.6.1)
6
8
  columnize (>= 0.3.1)
7
- debugger-linecache (~> 1.1.1)
8
- debugger-ruby_core_source (~> 1.1.3)
9
- debugger-linecache (1.1.1)
10
- debugger-ruby_core_source (>= 1.1.1)
11
- debugger-ruby_core_source (1.1.3)
9
+ debugger-linecache (~> 1.2.0)
10
+ debugger-ruby_core_source (~> 1.2.3)
11
+ debugger-linecache (1.2.0)
12
+ debugger-ruby_core_source (1.2.3)
12
13
  diff-lcs (1.1.3)
13
- ffi (1.0.11)
14
+ faraday (0.8.7)
15
+ multipart-post (~> 1.1)
16
+ ffi (1.9.0)
14
17
  git (1.2.5)
18
+ github_api (0.10.1)
19
+ addressable
20
+ faraday (~> 0.8.1)
21
+ hashie (>= 1.2)
22
+ multi_json (~> 1.4)
23
+ nokogiri (~> 1.5.2)
24
+ oauth2
15
25
  growl (1.0.3)
16
26
  guard (1.2.3)
17
27
  listen (>= 0.4.2)
18
28
  thor (>= 0.14.6)
19
29
  guard-rspec (1.1.0)
20
30
  guard (>= 1.1)
21
- jeweler (1.6.4)
31
+ hashie (2.0.5)
32
+ highline (1.6.19)
33
+ httpauth (0.2.0)
34
+ jeweler (1.8.6)
35
+ builder
22
36
  bundler (~> 1.0)
23
37
  git (>= 1.2.5)
38
+ github_api (= 0.10.1)
39
+ highline (>= 1.6.15)
40
+ nokogiri (= 1.5.10)
24
41
  rake
25
- json (1.7.3)
26
- listen (0.4.7)
27
- rb-fchange (~> 0.0.5)
28
- rb-fsevent (~> 0.9.1)
29
- rb-inotify (~> 0.8.8)
30
- mime-types (1.18)
31
- multi_json (1.3.6)
32
- nokogiri (1.5.4)
33
- rake (0.9.2.2)
34
- rb-fchange (0.0.5)
35
- ffi
36
- rb-fsevent (0.9.1)
37
- rb-inotify (0.8.8)
42
+ rdoc
43
+ json (1.8.0)
44
+ jwt (0.1.8)
45
+ multi_json (>= 1.5)
46
+ listen (1.2.2)
47
+ rb-fsevent (>= 0.9.3)
48
+ rb-inotify (>= 0.9)
49
+ rb-kqueue (>= 0.2)
50
+ mime-types (1.23)
51
+ multi_json (1.7.7)
52
+ multi_xml (0.5.4)
53
+ multipart-post (1.2.0)
54
+ nokogiri (1.5.10)
55
+ oauth2 (0.9.2)
56
+ faraday (~> 0.8)
57
+ httpauth (~> 0.2)
58
+ jwt (~> 0.1.4)
59
+ multi_json (~> 1.0)
60
+ multi_xml (~> 0.5)
61
+ rack (~> 1.2)
62
+ rack (1.5.2)
63
+ rake (10.1.0)
64
+ rb-fsevent (0.9.3)
65
+ rb-inotify (0.9.0)
66
+ ffi (>= 0.5.0)
67
+ rb-kqueue (0.2.0)
38
68
  ffi (>= 0.5.0)
39
- rdoc (3.12)
69
+ rdoc (4.0.1)
40
70
  json (~> 1.4)
41
71
  rspec (2.8.0)
42
72
  rspec-core (~> 2.8.0)
@@ -46,11 +76,11 @@ GEM
46
76
  rspec-expectations (2.8.0)
47
77
  diff-lcs (~> 1.1.2)
48
78
  rspec-mocks (2.8.0)
49
- simplecov (0.6.4)
79
+ simplecov (0.7.1)
50
80
  multi_json (~> 1.0)
51
- simplecov-html (~> 0.5.3)
52
- simplecov-html (0.5.3)
53
- thor (0.15.4)
81
+ simplecov-html (~> 0.7.1)
82
+ simplecov-html (0.7.1)
83
+ thor (0.18.1)
54
84
  typhoeus (0.4.2)
55
85
  ffi (~> 1.0)
56
86
  mime-types (~> 1.18)
@@ -59,14 +89,14 @@ PLATFORMS
59
89
  ruby
60
90
 
61
91
  DEPENDENCIES
62
- bundler (~> 1.1.0)
92
+ bundler (~> 1.3.0)
63
93
  debugger
64
94
  growl
65
95
  guard (~> 1.2.3)
66
96
  guard-rspec (~> 1.1.0)
67
- jeweler (~> 1.6.4)
97
+ jeweler (~> 1.8.0)
68
98
  nokogiri
69
99
  rdoc (>= 3.9.4)
70
100
  rspec (~> 2.8.0)
71
101
  simplecov
72
- typhoeus
102
+ typhoeus (~> 0.4.2)
data/README.markdown CHANGED
@@ -6,6 +6,10 @@ Books, music, movies, etc aren't currently supported.
6
6
  It is built on top of Nokogiri and Typhoeus.
7
7
  Used in production to power [www.droidmeter.com](http://www.droidmeter.com/?t=github).
8
8
 
9
+ **This project is currently seeking developers to help maintain it.
10
+ Please send pull requests or contact me if you are able to help out.
11
+ The app scraper is known to work, but leaderboard functionality is currently broken.**
12
+
9
13
  ## Dependencies
10
14
 
11
15
  * Nokogiri
@@ -68,6 +72,11 @@ Used in production to power [www.droidmeter.com](http://www.droidmeter.com/?t=gi
68
72
  puts "First place app (#{first_app.title}) price: #{first_app.price}" unless first_app.error
69
73
  puts "Last place app (#{last_app.title}) price: #{last_app.price}" unless last_app.error
70
74
 
75
+ ## Excessive Use
76
+
77
+ Google will block your IP address if you attempt to scrape large quantities of data.
78
+ Please contact me if you are looking for commercial data solutions.
79
+
71
80
  ## Contributing to Market Bot
72
81
 
73
82
  1. Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.9.1
1
+ 0.10.0
@@ -19,132 +19,109 @@ module MarketBot
19
19
  result = {}
20
20
 
21
21
  doc = Nokogiri::HTML(html)
22
-
23
- elements = doc.css('.doc-metadata').first.elements[2].elements
24
- elem_count = elements.count
25
-
26
- (3..(elem_count - 1)).select{ |n| n.odd? }.each do |i|
27
- field_name = elements[i].text
22
+ meta_info = doc.css('.meta-info')
23
+ meta_info.each do |info|
24
+ field_name = info.css('.title').text.strip
28
25
 
29
26
  case field_name
30
- when 'Updated:'
31
- result[:updated] = elements[i + 1].text
32
- when 'Current Version:'
33
- result[:current_version] = elements[i + 1].text
34
- when 'Requires Android:'
35
- result[:requires_android] = elements[i + 1].text
36
- when 'Category:'
37
- result[:category] = elements[i + 1].text
38
- when 'Installs:'
39
- result[:installs] = elements[i + 1].children.first.text
40
- when 'Size:'
41
- result[:size] = elements[i + 1].text
42
- when 'Price:'
43
- result[:price] = elements[i + 1].text
44
- when 'Content Rating:'
45
- result[:content_rating] = elements[i + 1].text
27
+ when 'Updated'
28
+ result[:updated] = info.css('.content').text.strip
29
+ when 'Installs'
30
+ result[:installs] = info.css('.content').text.strip
31
+ when 'Size'
32
+ result[:size] = info.css('.content').text.strip
33
+ when 'Current Version'
34
+ result[:current_version] = info.css('.content').text.strip
35
+ when 'Requires Android'
36
+ result[:requires_android] = info.css('.content').text.strip
37
+ when 'Content Rating'
38
+ result[:content_rating] = info.css('.content').text.strip
39
+ when 'Contact Developer'
40
+ info.css('.dev-link').each do |node|
41
+ if node.text.strip.eql? 'Email Developer'
42
+ result[:email] = node[:href].gsub(/^mailto:/,'')
43
+ else
44
+ redirect_url = node[:href]
45
+ if q_param = URI(redirect_url).query.split('&').select{ |p| p =~ /q=/ }.first
46
+ actual_url = q_param.gsub('q=', '')
47
+ end
48
+
49
+ result[:website_url] = actual_url
50
+ end
51
+ end
52
+
46
53
  end
47
54
  end
48
55
 
49
- result[:description] = doc.css('#doc-original-text').first.inner_html
50
- result[:title] = doc.css('.doc-banner-title').text
56
+ node = doc.xpath("//meta[@itemprop='price']").first
57
+ result[:price] = node[:content].strip rescue 'Free'
51
58
 
52
- rating_elem = doc.css('.average-rating-value')
53
- result[:rating] = rating_elem.first.text unless rating_elem.empty?
59
+ result[:category] = doc.css('.category').first.text.strip rescue ''
60
+ result[:description] = doc.xpath("//div[@itemprop='description']").first.inner_html.strip
61
+ result[:title] = doc.xpath("//div[@itemprop='name']").first.text.strip
54
62
 
55
- votes_elem = doc.css('.votes')
56
- result[:votes] = doc.css('.votes').first.text unless votes_elem.empty?
63
+ score = doc.css('.score-container').first
64
+ unless score.nil?
65
+ node = score.css('.score').first
66
+ result[:rating] = node.text.strip
67
+ node = score.xpath("//meta[@itemprop='ratingCount']").first
68
+ result[:votes] = node[:content].strip
69
+ end
57
70
 
58
- result[:developer] = doc.css('.doc-banner-title-container a').text
71
+ node = doc.xpath("//div[@itemprop='author']")
72
+ result[:developer] = node.css('.primary').first.text.strip
59
73
 
60
74
  result[:more_from_developer] = []
61
75
  result[:users_also_installed] = []
62
76
  result[:related] = []
63
77
 
64
- if similar_elem = doc.css('.doc-similar').first
65
- similar_elem.children.each do |similar_elem_child|
66
- assoc_app_type = similar_elem_child.attributes['data-analyticsid'].text
67
-
68
- next unless %w(more-from-developer users-also-installed related).include?(assoc_app_type)
78
+ node = doc.css('.recommendation')
79
+ node.css('.rec-cluster').each do |recommended|
80
+ assoc_app_type = recommended.css('.heading').first.text.strip.eql?('Similar' ) ? :related : :more_from_developer
81
+ recommended.css('.card').each do |card|
82
+ assoc_app = {}
83
+ assoc_app[:app_id] = card['data-docid'].strip
69
84
 
70
- assoc_app_type = assoc_app_type.gsub('-', '_').to_sym
71
- result[assoc_app_type] ||= []
72
-
73
- similar_elem_child.css('.app-left-column-related-snippet-container').each do |app_elem|
74
- assoc_app = {}
75
-
76
- assoc_app[:app_id] = app_elem.attributes['data-docid'].text
77
-
78
- result[assoc_app_type] << assoc_app
79
- end
85
+ result[assoc_app_type] << assoc_app
80
86
  end
81
87
  end
88
+ # "Users also installed" is no longer on the page; reuse the related apps list for rough backwards compatibility.
89
+ result[:users_also_installed] = result[:related]
82
90
 
83
- result[:banner_icon_url] = doc.css('.doc-banner-icon img').first.attributes['src'].value
84
-
85
- if image_elem = doc.css('.doc-banner-image-container img').first
86
- result[:banner_image_url] = image_elem.attributes['src'].value
87
- else
88
- result[:banner_image_url] = nil
91
+ node = doc.css('.cover-image').first
92
+ unless node.nil?
93
+ result[:banner_icon_url] = node[:src]
94
+ result[:banner_image_url] = node[:src]
89
95
  end
90
96
 
91
- if website_elem = doc.css('a').select{ |l| l.text.include?("Visit Developer's Website")}.first
92
- redirect_url = website_elem.attribute('href').value
93
-
94
- if q_param = URI(redirect_url).query.split('&').select{ |p| p =~ /q=/ }.first
95
- actual_url = q_param.gsub('q=', '')
97
+ result[:youtube_video_ids] = []
98
+ doc.css('.play-click-target').each do |node|
99
+ url = node['data-video-url']
100
+ unless url.nil?
101
+ result[:youtube_video_ids] << url.split('embed/').last.split('?').first
96
102
  end
97
-
98
- result[:website_url] = actual_url
99
- end
100
-
101
- if email_elem = doc.css('a').select{ |l| l.text.include?("Email Developer")}.first
102
- result[:email] = email_elem.attribute('href').value.gsub(/^mailto:/, '')
103
- end
104
-
105
- unless (video_section_elem = doc.css('.doc-video-section')).empty?
106
- urls = video_section_elem.children.css('embed').map{ |e| e.attribute('src').value }
107
- result[:youtube_video_ids] = urls.map{ |u| /youtube\.com\/v\/(.*)\?/.match(u)[1] }
108
- else
109
- result[:youtube_video_ids] = []
110
103
  end
111
104
 
112
- screenshots = doc.css('.screenshot-carousel-content-container img')
113
-
114
- if screenshots && screenshots.length > 0
115
- result[:screenshot_urls] = screenshots.map { |s| s.attributes['src'].value }
116
- else
117
- result[:screenshot_urls] = []
105
+ result[:screenshot_urls] = []
106
+ doc.css('.screenshot').each do |node|
107
+ result[:screenshot_urls] << node[:src]
118
108
  end
119
109
 
120
- result[:whats_new] = doc.css('.doc-whatsnew-container').inner_html
110
+ node = doc.css('.whatsnew').first
111
+ result[:whats_new] = node.inner_html.strip unless node.nil?
121
112
 
113
+ # Permissions are stubbed out for now; they can't be found in the redesigned page.
122
114
  result[:permissions] = permissions = []
123
- perm_types = ['dangerous', 'safe']
124
- perm_types.each do |type|
125
- doc.css("#doc-permissions-#{type} .doc-permission-group").each do |group_elem|
126
- title = group_elem.css('.doc-permission-group-title').text
127
- group_elem.css('.doc-permission-description').each do |desc_elem|
128
- #permissions << { :security => type, :group => title, :description => desc_elem.text }
129
- end
130
- descriptions = group_elem.css('.doc-permission-description').map { |e| e.text }
131
- descriptions_full = group_elem.css('.doc-permission-description-full').map { |e| e.text }
132
- (0...descriptions.length).each do |i|
133
- desc = descriptions[i]
134
- desc_full = descriptions_full[i]
135
- permissions << { :security => type, :group => title, :description => desc, :description_full => desc_full }
136
- end
137
- end
138
- end
139
115
 
140
116
  result[:rating_distribution] = { 5 => nil, 4 => nil, 3 => nil, 2 => nil, 1 => nil }
141
117
 
142
- if (histogram = doc.css('div.histogram-table').first)
143
- cur_index = 5
144
- histogram.css('tr').each do |e|
145
- result[:rating_distribution][cur_index] = e.children.last.inner_text.gsub(/[^0-9]/, '').to_i
146
- cur_index -= 1
147
- end
118
+ histogram = doc.css('div.rating-histogram')
119
+ cur_index = 5
120
+ %w(five four three two one).each do |slot|
121
+ node = histogram.css(".#{slot.to_s}")
122
+ result[:rating_distribution][cur_index] = node.css('.bar-number').text.to_i
123
+ cur_index -= 1
124
+
148
125
  end
149
126
 
150
127
  result[:html] = html
@@ -30,7 +30,11 @@ module MarketBot
30
30
  end
31
31
 
32
32
  result[:title] = details_node.css('.title').first.attributes['title'].to_s
33
- result[:price_usd] = details_node.css('.buy-button-price').children.first.text.gsub(' Buy', '')
33
+
34
+ if (price_elem = details_node.css('.buy-button-price').children.first)
35
+ result[:price_usd] = price_elem.text.gsub(' Buy', '')
36
+ end
37
+
34
38
  result[:developer] = details_node.css('.attribution').children.first.text
35
39
  result[:market_id] = details_node.css('.title').first.attributes['href'].to_s.gsub('/store/apps/details?id=', '').gsub(/&feature=.*$/, '')
36
40
  result[:market_url] = "https://play.google.com/store/apps/details?id=#{result[:market_id]}&hl=en"
@@ -69,6 +73,7 @@ module MarketBot
69
73
  @hydra = options[:hydra] || MarketBot.hydra
70
74
  @request_opts = options[:request_opts] || {}
71
75
  @parsed_results = []
76
+ @pending_pages = []
72
77
  end
73
78
 
74
79
  def market_urls(options={})
@@ -92,7 +97,8 @@ module MarketBot
92
97
  results
93
98
  end
94
99
 
95
- def enqueue_update(options={})
100
+ def enqueue_update(options={},&block)
101
+ @callback = block
96
102
  if @identifier.to_s.downcase == 'editors_choice' && category == nil
97
103
  url = 'https://play.google.com/store/apps/collection/editors_choice?&hl=en'
98
104
  process_page(url, 1)
@@ -130,6 +136,7 @@ module MarketBot
130
136
 
131
137
  private
132
138
  def process_page(url, page_num)
139
+ @pending_pages << page_num
133
140
  request = Typhoeus::Request.new(url, @request_opts)
134
141
  request.on_complete do |response|
135
142
  # HACK: Typhoeus <= 0.4.2 returns a response, 0.5.0pre returns the request.
@@ -143,6 +150,8 @@ module MarketBot
143
150
 
144
151
  def update_callback(result, page)
145
152
  @parsed_results[page] = result
153
+ @pending_pages.delete(page)
154
+ @callback.call(self) if @callback and @pending_pages.empty?
146
155
  end
147
156
  end
148
157
 
@@ -4,7 +4,7 @@ module MarketBot
4
4
  # Search query pages are extremely similar to leaderboard pages.
5
5
  # Amazingly, this inheritance hack works!
6
6
  class SearchQuery < MarketBot::Android::Leaderboard
7
- def initialze(query, options={})
7
+ def initialize(query, options={})
8
8
  super(query, nil, options)
9
9
  end
10
10
 
data/market_bot.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "market_bot"
8
- s.version = "0.9.1"
8
+ s.version = "0.10.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Chad Remesch"]
12
- s.date = "2012-10-10"
12
+ s.date = "2013-07-19"
13
13
  s.description = "Market Bot is a high performance Ruby scraper for Google's Android Market with a simple to use API. It is built on top of Nokogiri and Typhoeus."
14
14
  s.email = "chad@remesch.com"
15
15
  s.extra_rdoc_files = [
@@ -51,18 +51,18 @@ Gem::Specification.new do |s|
51
51
  s.homepage = "http://github.com/chadrem/market_bot"
52
52
  s.licenses = ["MIT"]
53
53
  s.require_paths = ["lib"]
54
- s.rubygems_version = "1.8.24"
54
+ s.rubygems_version = "2.0.3"
55
55
  s.summary = "Market Bot: High performance Ruby scraper for Google's Android Market"
56
56
 
57
57
  if s.respond_to? :specification_version then
58
- s.specification_version = 3
58
+ s.specification_version = 4
59
59
 
60
60
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
61
- s.add_runtime_dependency(%q<typhoeus>, [">= 0"])
61
+ s.add_runtime_dependency(%q<typhoeus>, ["~> 0.4.2"])
62
62
  s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
63
63
  s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
64
- s.add_development_dependency(%q<bundler>, ["~> 1.1.0"])
65
- s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
64
+ s.add_development_dependency(%q<bundler>, ["~> 1.3.0"])
65
+ s.add_development_dependency(%q<jeweler>, ["~> 1.8.0"])
66
66
  s.add_development_dependency(%q<simplecov>, [">= 0"])
67
67
  s.add_development_dependency(%q<rdoc>, [">= 3.9.4"])
68
68
  s.add_development_dependency(%q<guard>, ["~> 1.2.3"])
@@ -70,11 +70,11 @@ Gem::Specification.new do |s|
70
70
  s.add_development_dependency(%q<growl>, [">= 0"])
71
71
  s.add_development_dependency(%q<debugger>, [">= 0"])
72
72
  else
73
- s.add_dependency(%q<typhoeus>, [">= 0"])
73
+ s.add_dependency(%q<typhoeus>, ["~> 0.4.2"])
74
74
  s.add_dependency(%q<nokogiri>, [">= 0"])
75
75
  s.add_dependency(%q<rspec>, ["~> 2.8.0"])
76
- s.add_dependency(%q<bundler>, ["~> 1.1.0"])
77
- s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
76
+ s.add_dependency(%q<bundler>, ["~> 1.3.0"])
77
+ s.add_dependency(%q<jeweler>, ["~> 1.8.0"])
78
78
  s.add_dependency(%q<simplecov>, [">= 0"])
79
79
  s.add_dependency(%q<rdoc>, [">= 3.9.4"])
80
80
  s.add_dependency(%q<guard>, ["~> 1.2.3"])
@@ -83,11 +83,11 @@ Gem::Specification.new do |s|
83
83
  s.add_dependency(%q<debugger>, [">= 0"])
84
84
  end
85
85
  else
86
- s.add_dependency(%q<typhoeus>, [">= 0"])
86
+ s.add_dependency(%q<typhoeus>, ["~> 0.4.2"])
87
87
  s.add_dependency(%q<nokogiri>, [">= 0"])
88
88
  s.add_dependency(%q<rspec>, ["~> 2.8.0"])
89
- s.add_dependency(%q<bundler>, ["~> 1.1.0"])
90
- s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
89
+ s.add_dependency(%q<bundler>, ["~> 1.3.0"])
90
+ s.add_dependency(%q<jeweler>, ["~> 1.8.0"])
91
91
  s.add_dependency(%q<simplecov>, [">= 0"])
92
92
  s.add_dependency(%q<rdoc>, [">= 3.9.4"])
93
93
  s.add_dependency(%q<guard>, ["~> 1.2.3"])