rubygems-crawler 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,13 @@
1
#!/usr/bin/env ruby

# Enrich the gem documents already stored in MongoDB with detailed data
# (info / versions / owners) fetched from the RubyGems.org API.
#
# Define an env variable MONGO_URI like:
# MONGO_URI='mongodb://localhost:27017/'

require 'rubygems-crawler/gems_crawler'

mongo_db = Mongo::MongoClient
  .from_uri(ENV['MONGO_URI'] || 'mongodb://localhost:27017/')
  .db('rubygems')

RubyGems::GemsCrawler.new(mongo_db).crawl_from('a')
@@ -0,0 +1,13 @@
1
#!/usr/bin/env ruby

# Crawl the alphabetical gem index pages of RubyGems.org (A through Z)
# and store the discovered gem names in MongoDB.
#
# Define an env variable MONGO_URI like:
# MONGO_URI='mongodb://localhost:27017/'

require 'rubygems-crawler/web_crawler'

mongo_uri    = ENV['MONGO_URI'] || 'mongodb://localhost:27017/'
mongo_client = Mongo::MongoClient.from_uri(mongo_uri)
web_crawler  = RubyGems::WebCrawler.new(mongo_client.db('rubygems'))

('A'..'Z').each do |seed|
  web_crawler.crawl(seed)
end
@@ -0,0 +1,42 @@
1
require 'gems'
require 'mongo'

# A very simple gems crawler for Rubygems.org
module RubyGems
  # Fetches detailed metadata (info, versions, owners) for gems already
  # listed in the Mongo :gems collection, using the RubyGems.org API via
  # the `gems` gem, and writes the enriched documents back to Mongo.
  class GemsCrawler

    GRACE_PERIOD = 5 # seconds to sleep between API calls - be gentle

    # mongo - a Mongo database handle exposing a :gems collection
    #         (e.g. the result of MongoClient#db)
    def initialize(mongo)
      @mongo = mongo
    end

    # Crawl every gem document that has no owner data yet, enriching each
    # one via #crawl. NOTE(review): initial_name is currently unused - the
    # name filter is commented out below; kept for interface compatibility.
    def crawl_from(initial_name='a')
      #name: {'$gte' => initial_name} - to filter by name
      @mongo[:gems].find({owners: nil}, {fields: ["name"]}).each_slice(10) do |bulk|
        bulk.each do |mongo_doc|
          crawl(mongo_doc['name'])
          sleep GRACE_PERIOD #be nice
        end
      end
    end

    # Fetch info, versions and owners for a single gem and persist them.
    # Errors are reported to STDERR (with the reason) and swallowed so one
    # bad gem does not abort a long-running crawl.
    def crawl(gem_name)
      # Fixed label: this is the gems (API) crawler, not the web crawler.
      STDOUT.puts "[RubyGems Gems Crawler] Acquiring data for gem #{gem_name}"

      gem_object = Gems.info(gem_name)
      gem_object['versions'] = Gems.versions(gem_name)
      gem_object['owners'] = Gems.owners(gem_name)

      save(gem_object)
    rescue => e
      # Include the failure reason instead of silently dropping it.
      STDERR.puts "[RubyGems Gems Crawler] Error while acquiring data for gem #{gem_name}: #{e.message}"
    end

    # Save all the gem data into Mongo, replacing the document whose name
    # matches the crawled gem.
    def save(gem_object)
      @mongo[:gems].find_and_modify(query: {name: gem_object['name']}, update: gem_object)
    end

  end
end
@@ -0,0 +1,90 @@
1
require 'net/http'
require 'nokogiri'
require 'mongo'

# A very simple web crawler for Rubygems.org
module RubyGems
  # Walks the alphabetical gem index pages on rubygems.org, scrapes the
  # gem names out of each page and stores them in a Mongo :gems collection.
  class WebCrawler

    BASE_URL = 'http://rubygems.org'
    REQUEST_HEADERS = {'User-Agent'=>'rubygems-crawler'}
    TIMEOUT = 30     # seconds, applied to both open and read timeouts
    GRACE_PERIOD = 1 # Sleep for a while between pages - be gentle

    # mongo - a Mongo database handle exposing a :gems collection
    def initialize(mongo)
      @mongo = mongo
    end

    # Crawl all the pages of RubyGems, given an initial letter and save the
    # data into MongoDB. Follows the pagination link until no next page is
    # found or a page fails to download.
    def crawl(letter='A')
      url = "#{BASE_URL}/gems?letter=#{letter}"
      while url && gems = download_page(url)
        save_gems(gems[:gems])
        STDOUT.puts "[RubyGems Web Crawler] [#{url}] - Acquired #{gems[:gems].count} gems"

        url = (gems[:next_path]) ? "#{BASE_URL}#{gems[:next_path]}" : nil
        sleep GRACE_PERIOD
      end
    end

    # Download an HTML page given an url, parse the HTML and convert the
    # result back into an HASH. Returns nil when the request did not yield
    # a 200 response body.
    def download_page(url)
      STDOUT.puts "Acquiring #{url}"

      network_res = network_call(url, REQUEST_HEADERS, TIMEOUT)
      return parse_content(network_res[:response]) if network_res && network_res[:response]
    end

    # Execute a GET HTTP call to url given the specified headers.
    # Returns {code:, response:} on HTTP 200, {code:} for other statuses,
    # and {error:, code: 0} after four failed attempts on network errors.
    def network_call(url, request_headers={}, timeout = nil)

      retries = 0
      begin
        uri = URI.parse(url.ascii_only? ? url : URI.escape(url))
        http = Net::HTTP.new(uri.host, uri.port)

        unless timeout.nil?
          http.open_timeout = timeout
          http.read_timeout = timeout
        end

        request = Net::HTTP::Get.new(uri.request_uri, request_headers)
        response = http.request(request)

      rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
        retries += 1
        retry unless retries > 3 # bounded retry: at most 4 attempts
        return {error: e, code: 0}
      end

      result = {:code=>response.code.to_i}
      result[:response] = response.body if response.code.to_s == '200'
      result
    end

    # Parse the HTML of the page extracting gem names and the path of the
    # next index page (nil when on the last page).
    def parse_content(response)
      gem_res = {:gems => [], :next_path => nil}

      html_doc = Nokogiri::HTML(response)

      html_doc.css('.gems li a>strong').each do |node|
        # Entries look like "gemname (1.2.3)". Use an explicit match and
        # only append on success: the original pushed $1 unconditionally,
        # which appended nil (or a stale capture from the previous
        # iteration) whenever a node did not match the pattern.
        match = node.content.match(/(.+)\s\(.+\)/)
        gem_res[:gems] << match[1] if match
      end

      next_page = html_doc.css('.next_page').first
      if next_page
        gem_res[:next_path] = next_page.attr('href')
      end

      gem_res
    end

    # Save all the gem names into Mongo
    def save_gems(gems)
      gems.each {|gem_name| @mongo[:gems].insert({name: gem_name}) }
    end

  end
end
metadata ADDED
@@ -0,0 +1,138 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rubygems-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Luca Bonmassar
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-10-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.5.5
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.5.5
30
+ - !ruby/object:Gem::Dependency
31
+ name: mongo
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 1.8.0
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 1.8.0
46
+ - !ruby/object:Gem::Dependency
47
+ name: bson_ext
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: 1.8.0
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 1.8.0
62
+ - !ruby/object:Gem::Dependency
63
+ name: gems
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: 0.8.3
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: 0.8.3
78
+ - !ruby/object:Gem::Dependency
79
+ name: rake
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: A very simple crawler for RubyGems.org used to demo the power of ElasticSearch
95
+ at RubyConf 2013
96
+ email:
97
+ - luca@gild.com
98
+ executables:
99
+ - rubygems-gems-crawler
100
+ - rubygems-web-crawler
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - lib/rubygems-crawler/gems_crawler.rb
105
+ - lib/rubygems-crawler/web_crawler.rb
106
+ - bin/rubygems-gems-crawler
107
+ - bin/rubygems-web-crawler
108
+ homepage: http://www.gild.com
109
+ licenses: []
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ none: false
116
+ requirements:
117
+ - - ! '>='
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ segments:
121
+ - 0
122
+ hash: -4028937865614676331
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ none: false
125
+ requirements:
126
+ - - ! '>='
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ segments:
130
+ - 0
131
+ hash: -4028937865614676331
132
+ requirements: []
133
+ rubyforge_project:
134
+ rubygems_version: 1.8.24
135
+ signing_key:
136
+ specification_version: 3
137
+ summary: A very simple crawler for RubyGems.org
138
+ test_files: []