rubygems-crawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/rubygems-gems-crawler +13 -0
- data/bin/rubygems-web-crawler +13 -0
- data/lib/rubygems-crawler/gems_crawler.rb +42 -0
- data/lib/rubygems-crawler/web_crawler.rb +90 -0
- metadata +138 -0
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Entry point for the gems (API) crawler.
# The Mongo connection string comes from the MONGO_URI env variable, e.g.:
# MONGO_URI='mongodb://localhost:27017/'

require 'rubygems-crawler/gems_crawler'

uri        = ENV['MONGO_URI'] || 'mongodb://localhost:27017/'
connection = Mongo::MongoClient.from_uri(uri)
database   = connection.db('rubygems')

crawler = RubyGems::GemsCrawler.new(database)
crawler.crawl_from('a')
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Entry point for the web (HTML) crawler.
# The Mongo connection string comes from the MONGO_URI env variable, e.g.:
# MONGO_URI='mongodb://localhost:27017/'

require 'rubygems-crawler/web_crawler'

uri        = ENV['MONGO_URI'] || 'mongodb://localhost:27017/'
connection = Mongo::MongoClient.from_uri(uri)
database   = connection.db('rubygems')

crawler = RubyGems::WebCrawler.new(database)
('A'..'Z').each do |letter|
  crawler.crawl(letter)
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'gems'
require 'mongo'

# A very simple gems crawler for Rubygems.org.
# For every gem name already stored in Mongo (in the :gems collection) that
# has no 'owners' field yet, it fetches the gem's details (info, versions,
# owners) through the Gems API client and writes them back to Mongo.
module RubyGems
  class GemsCrawler

    GRACE_PERIOD = 5 # seconds slept between API calls - be gentle

    # mongo - a Mongo DB handle; documents live in the :gems collection.
    def initialize(mongo)
      @mongo = mongo
    end

    # Crawl every gem document that is still missing its 'owners' field.
    # NOTE(review): initial_name is currently unused - the commented-out
    # filter below shows the intended name-range query; kept for
    # backward compatibility.
    def crawl_from(initial_name='a')
      #name: {'$gte' => initial_name} - to filter by name
      @mongo[:gems].find({owners: nil}, {fields: ["name"]}).each_slice(10) do |bulk|
        bulk.each do |mongo_doc|
          crawl(mongo_doc['name'])
          sleep GRACE_PERIOD #be nice
        end
      end
    end

    # Fetch info, versions and owners for a single gem and persist them.
    # Any error is logged and swallowed so one bad gem does not abort
    # the whole crawl.
    # (Fixed: log prefix previously said "Web Crawler" - a copy-paste
    # mistake from web_crawler.rb.)
    def crawl(gem_name)
      STDOUT.puts "[RubyGems Gems Crawler] Acquiring data for gem #{gem_name}"

      gem_object = Gems.info(gem_name)
      gem_object['versions'] = Gems.versions(gem_name)
      gem_object['owners'] = Gems.owners(gem_name)

      save(gem_object)
    rescue
      STDERR.puts "[RubyGems Gems Crawler] Error while acquiring data for gem #{gem_name}"
    end

    # Save all the gem data into Mongo, replacing the document whose
    # name matches the fetched gem.
    def save(gem_object)
      @mongo[:gems].find_and_modify(query: {name: gem_object['name']}, update: gem_object)
    end

  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
require 'net/http'
require 'nokogiri'
require 'mongo'

# A very simple web crawler for Rubygems.org.
# Walks the paginated gem index (one initial letter at a time), scrapes
# gem names out of the HTML and stores them into MongoDB.
module RubyGems
  class WebCrawler

    BASE_URL = 'http://rubygems.org'
    REQUEST_HEADERS = {'User-Agent'=>'rubygems-crawler'}
    TIMEOUT=30
    GRACE_PERIOD=1 #Sleep for a while - be gentle

    # mongo - a Mongo DB handle; names are stored in the :gems collection.
    def initialize(mongo)
      @mongo = mongo
    end

    # Crawl all the pages of RubyGems for one initial letter, saving each
    # page of gem names into MongoDB as it is fetched.
    def crawl(letter='A')
      url = "#{BASE_URL}/gems?letter=#{letter}"
      while url
        page = download_page(url)
        break unless page

        save_gems(page[:gems])
        STDOUT.puts "[RubyGems Web Crawler] [#{url}] - Acquired #{page[:gems].count} gems"

        url = page[:next_path] ? "#{BASE_URL}#{page[:next_path]}" : nil
        sleep GRACE_PERIOD
      end
    end

    # Download one HTML page, parse it and return the resulting hash.
    # Returns nil when the request failed or came back without a body.
    def download_page(url)
      STDOUT.puts "Acquiring #{url}"

      result = network_call(url, REQUEST_HEADERS, TIMEOUT)
      return nil unless result && result[:response]

      parse_content(result[:response])
    end

    # Execute a GET HTTP call to url with the given headers, retrying
    # transient network errors up to 3 times.
    # Returns {code: Integer} plus :response (the body) on HTTP 200,
    # or {error: e, code: 0} after repeated failures.
    def network_call(url, request_headers={}, timeout = nil)

      attempts = 0
      begin
        uri = URI.parse(url.ascii_only? ? url : URI.escape(url))
        http = Net::HTTP.new(uri.host, uri.port)

        unless timeout.nil?
          http.open_timeout = timeout
          http.read_timeout = timeout
        end

        get = Net::HTTP::Get.new(uri.request_uri, request_headers)
        response = http.request(get)

      rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
        attempts += 1
        retry if attempts <= 3
        return {error: e, code: 0}
      end

      payload = {:code => response.code.to_i}
      payload[:response] = response.body if response.code.to_s == '200'
      payload
    end

    # Parse the page HTML, extracting the gem names and the relative path
    # of the next page (:next_path stays nil on the last page).
    def parse_content(response)
      page = {:gems => [], :next_path => nil}

      html_doc = Nokogiri::HTML(response)

      # Each entry renders as "name (version)"; capture just the name.
      html_doc.css('.gems li a>strong').each do |node|
        node.content =~ /(.+)\s\(.+\)/
        page[:gems] << $1
      end

      pager = html_doc.css('.next_page').first
      page[:next_path] = pager.attr('href') if pager

      page
    end

    # Insert one document per gem name into the :gems collection.
    def save_gems(gems)
      gems.each do |gem_name|
        @mongo[:gems].insert({name: gem_name})
      end
    end

  end
end
|
metadata
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rubygems-crawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Luca Bonmassar
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-10-28 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.5.5
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.5.5
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: mongo
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 1.8.0
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 1.8.0
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: bson_ext
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.8.0
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.8.0
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: gems
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ~>
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 0.8.3
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 0.8.3
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: rake
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
description: A very simple crawler for RubyGems.org used to demo the power of ElasticSearch
|
95
|
+
at RubyConf 2013
|
96
|
+
email:
|
97
|
+
- luca@gild.com
|
98
|
+
executables:
|
99
|
+
- rubygems-gems-crawler
|
100
|
+
- rubygems-web-crawler
|
101
|
+
extensions: []
|
102
|
+
extra_rdoc_files: []
|
103
|
+
files:
|
104
|
+
- lib/rubygems-crawler/gems_crawler.rb
|
105
|
+
- lib/rubygems-crawler/web_crawler.rb
|
106
|
+
- bin/rubygems-gems-crawler
|
107
|
+
- bin/rubygems-web-crawler
|
108
|
+
homepage: http://www.gild.com
|
109
|
+
licenses: []
|
110
|
+
post_install_message:
|
111
|
+
rdoc_options: []
|
112
|
+
require_paths:
|
113
|
+
- lib
|
114
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
115
|
+
none: false
|
116
|
+
requirements:
|
117
|
+
- - ! '>='
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '0'
|
120
|
+
segments:
|
121
|
+
- 0
|
122
|
+
hash: -4028937865614676331
|
123
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
|
+
none: false
|
125
|
+
requirements:
|
126
|
+
- - ! '>='
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: '0'
|
129
|
+
segments:
|
130
|
+
- 0
|
131
|
+
hash: -4028937865614676331
|
132
|
+
requirements: []
|
133
|
+
rubyforge_project:
|
134
|
+
rubygems_version: 1.8.24
|
135
|
+
signing_key:
|
136
|
+
specification_version: 3
|
137
|
+
summary: A very simple crawler for RubyGems.org
|
138
|
+
test_files: []
|