rubygems-crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bin/rubygems-gems-crawler ADDED
@@ -0,0 +1,13 @@
+ #!/usr/bin/env ruby
+
+ # Optionally define the env variable MONGO_URI, e.g.:
+ # MONGO_URI='mongodb://localhost:27017/'
+
+ require 'rubygems-crawler/gems_crawler'
+
+ mongo_uri = ENV['MONGO_URI'] || 'mongodb://localhost:27017/'
+ mongo_client = Mongo::MongoClient.from_uri(mongo_uri)
+ mongo_db = mongo_client.db('rubygems')
+
+ g = RubyGems::GemsCrawler.new mongo_db
+ g.crawl_from 'a'
bin/rubygems-web-crawler ADDED
@@ -0,0 +1,13 @@
+ #!/usr/bin/env ruby
+
+ # Optionally define the env variable MONGO_URI, e.g.:
+ # MONGO_URI='mongodb://localhost:27017/'
+
+ require 'rubygems-crawler/web_crawler'
+
+ mongo_uri = ENV['MONGO_URI'] || 'mongodb://localhost:27017/'
+ mongo_client = Mongo::MongoClient.from_uri(mongo_uri)
+ mongo_db = mongo_client.db('rubygems')
+
+ w = RubyGems::WebCrawler.new mongo_db
+ ('A'..'Z').each {|seed| w.crawl(seed) }
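
Taken together, the two executables form a two-stage pipeline: rubygems-web-crawler scrapes the alphabetical index pages and seeds the gems collection with bare name documents, and rubygems-gems-crawler then enriches every document that still has no owners field via the RubyGems API. A minimal sketch of running both stages from a single script (hypothetical, not shipped with the gem; it assumes the same legacy mongo ~> 1.8 driver the gemspec pins):

#!/usr/bin/env ruby
# Hypothetical combined runner - a sketch only, not part of this gem.

require 'rubygems-crawler/web_crawler'
require 'rubygems-crawler/gems_crawler'

mongo_uri = ENV['MONGO_URI'] || 'mongodb://localhost:27017/'
mongo_db  = Mongo::MongoClient.from_uri(mongo_uri).db('rubygems')

# Stage 1: scrape the A-Z index pages and insert bare {name: ...} documents.
web = RubyGems::WebCrawler.new(mongo_db)
('A'..'Z').each { |seed| web.crawl(seed) }

# Stage 2: fill in info/versions/owners for every document without an owners field.
RubyGems::GemsCrawler.new(mongo_db).crawl_from('a')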
lib/rubygems-crawler/gems_crawler.rb ADDED
@@ -0,0 +1,42 @@
+ require 'gems'
+ require 'mongo'
+
+ # A very simple gems crawler for Rubygems.org
+ module RubyGems
+   class GemsCrawler
+
+     GRACE_PERIOD = 5 # be gentle
+
+     def initialize(mongo)
+       @mongo = mongo
+     end
+
+     def crawl_from(initial_name='a')
+       # To resume from a given name, add name: {'$gte' => initial_name} to the query below
+       @mongo[:gems].find({owners: nil}, {fields: ["name"]}).each_slice(10) do |bulk|
+         bulk.each do |mongo_doc|
+           crawl(mongo_doc['name'])
+           sleep GRACE_PERIOD # be nice to the API
+         end
+       end
+     end
+
+     def crawl(gem_name)
+       STDOUT.puts "[RubyGems Gems Crawler] Acquiring data for gem #{gem_name}"
+
+       gem_object = Gems.info(gem_name)
+       gem_object['versions'] = Gems.versions(gem_name)
+       gem_object['owners'] = Gems.owners(gem_name)
+
+       save(gem_object)
+     rescue
+       STDERR.puts "[RubyGems Gems Crawler] Error while acquiring data for gem #{gem_name}"
+     end
+
+     # Save all the gem data into Mongo
+     def save(gem_object)
+       @mongo[:gems].find_and_modify(query: {name: gem_object['name']}, update: gem_object)
+     end
+
+   end
+ end
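
A design note that follows from the code above: save calls find_and_modify without upsert, so it only replaces documents that already exist, which is why the collection has to be seeded (normally by the web crawler) before GemsCrawler runs. A minimal sketch of crawling a single gem by hand, with the caveat that the exact fields returned by the Gems API calls are not guaranteed here:

require 'mongo'
require 'rubygems-crawler/gems_crawler'

mongo_db = Mongo::MongoClient.from_uri('mongodb://localhost:27017/').db('rubygems')
mongo_db[:gems].insert({name: 'nokogiri'})   # seed the document, as the web crawler would

crawler = RubyGems::GemsCrawler.new(mongo_db)
crawler.crawl('nokogiri')                    # merges Gems.info/Gems.versions/Gems.owners into one document

doc = mongo_db[:gems].find_one(name: 'nokogiri')
puts doc['owners'].inspect                   # the owner records returned by the RubyGems API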
lib/rubygems-crawler/web_crawler.rb ADDED
@@ -0,0 +1,90 @@
+ require 'net/http'
+ require 'nokogiri'
+ require 'mongo'
+
+ # A very simple web crawler for Rubygems.org
+ module RubyGems
+   class WebCrawler
+
+     BASE_URL = 'http://rubygems.org'
+     REQUEST_HEADERS = {'User-Agent' => 'rubygems-crawler'}
+     TIMEOUT = 30
+     GRACE_PERIOD = 1 # sleep for a while - be gentle
+
+     def initialize(mongo)
+       @mongo = mongo
+     end
+
+     # Crawl all the index pages of RubyGems.org for a given initial letter and save the gem names into MongoDB
+     def crawl(letter='A')
+       url = "#{BASE_URL}/gems?letter=#{letter}"
+       while url && (gems = download_page(url))
+         save_gems(gems[:gems])
+         STDOUT.puts "[RubyGems Web Crawler] [#{url}] - Acquired #{gems[:gems].count} gems"
+
+         url = (gems[:next_path]) ? "#{BASE_URL}#{gems[:next_path]}" : nil
+         sleep GRACE_PERIOD
+       end
+     end
+
+     # Download the HTML page at the given URL, parse it and return the result as a hash
+     def download_page(url)
+       STDOUT.puts "Acquiring #{url}"
+
+       network_res = network_call(url, REQUEST_HEADERS, TIMEOUT)
+       return parse_content(network_res[:response]) if network_res && network_res[:response]
+     end
+
+     # Execute an HTTP GET request against the given URL with the specified headers
+     def network_call(url, request_headers={}, timeout = nil)
+
+       retries = 0
+       begin
+         uri = URI.parse(url.ascii_only? ? url : URI.escape(url))
+         http = Net::HTTP.new(uri.host, uri.port)
+
+         unless timeout.nil?
+           http.open_timeout = timeout
+           http.read_timeout = timeout
+         end
+
+         request = Net::HTTP::Get.new(uri.request_uri, request_headers)
+         response = http.request(request)
+
+       rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
+         retries += 1
+         retry unless retries > 3
+         return {error: e, code: 0}
+       end
+
+       result = {:code => response.code.to_i}
+       result[:response] = response.body if response.code.to_s == '200'
+       result
+     end
+
+     # Parse the HTML of the page, extracting the gem names and the path of the next page
+     def parse_content(response)
+       gem_res = {:gems => [], :next_path => nil}
+
+       html_doc = Nokogiri::HTML(response)
+
+       html_doc.css('.gems li a>strong').each do |node|
+         node.content =~ /(.+)\s\(.+\)/
+         gem_res[:gems] << $1
+       end
+
+       next_page = html_doc.css('.next_page').first
+       if next_page
+         gem_res[:next_path] = next_page.attr('href')
+       end
+
+       gem_res
+     end
+
+     # Save all the gem names into Mongo
+     def save_gems(gems)
+       gems.each {|gem_name| @mongo[:gems].insert({name: gem_name}) }
+     end
+
+   end
+ end
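
The regex in parse_content assumes each index entry renders as "name (version)" inside the '.gems li a>strong' selector, and pagination is followed through the '.next_page' link. The sketch below exercises the parser against a hypothetical HTML fragment (the markup is invented to match those selectors, not copied from rubygems.org):

require 'nokogiri'
require 'rubygems-crawler/web_crawler'

# Invented fixture shaped to match the '.gems li a>strong' and '.next_page' selectors.
html = <<-HTML
  <div class="gems">
    <ol>
      <li><a href="/gems/nokogiri"><strong>nokogiri (1.6.0)</strong></a></li>
      <li><a href="/gems/mongo"><strong>mongo (1.8.0)</strong></a></li>
    </ol>
  </div>
  <a class="next_page" href="/gems?letter=N&page=2">Next</a>
HTML

crawler = RubyGems::WebCrawler.new(nil)  # parse_content never touches @mongo
crawler.parse_content(html)
# => {:gems=>["nokogiri", "mongo"], :next_path=>"/gems?letter=N&page=2"}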
metadata ADDED
@@ -0,0 +1,138 @@
+ --- !ruby/object:Gem::Specification
+ name: rubygems-crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+   prerelease:
+ platform: ruby
+ authors:
+ - Luca Bonmassar
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-10-28 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.5.5
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.5.5
+ - !ruby/object:Gem::Dependency
+   name: mongo
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.8.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.8.0
+ - !ruby/object:Gem::Dependency
+   name: bson_ext
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.8.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.8.0
+ - !ruby/object:Gem::Dependency
+   name: gems
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.8.3
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.8.3
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: A very simple crawler for RubyGems.org used to demo the power of ElasticSearch
+   at RubyConf 2013
+ email:
+ - luca@gild.com
+ executables:
+ - rubygems-gems-crawler
+ - rubygems-web-crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/rubygems-crawler/gems_crawler.rb
+ - lib/rubygems-crawler/web_crawler.rb
+ - bin/rubygems-gems-crawler
+ - bin/rubygems-web-crawler
+ homepage: http://www.gild.com
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: -4028937865614676331
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: -4028937865614676331
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: A very simple crawler for RubyGems.org
+ test_files: []
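
The gemspec pins the legacy mongo driver (~> 1.8.0), whose Mongo::MongoClient, db and find_and_modify APIs the code above relies on; the 2.x drivers removed those calls. A Gemfile mirroring the published constraints might look like this (a sketch, not part of the gem):

source 'https://rubygems.org'

gem 'rubygems-crawler', '0.0.1'

# The runtime pins below simply restate the gemspec requirements above.
gem 'nokogiri', '>= 1.5.5'
gem 'mongo',    '~> 1.8.0'
gem 'bson_ext', '~> 1.8.0'
gem 'gems',     '~> 0.8.3'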