collamine 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/collamine.gemspec +0 -2
- data/lib/collamine.rb +11 -23
- data/lib/collamine/request.rb +9 -5
- data/lib/collamine/version.rb +1 -1
- data/test.rb +32 -4
- metadata +3 -31
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4082cf4e4a34cca97098611238f3ad46cf6e74d0
|
4
|
+
data.tar.gz: 7c5302924a2c2adfcd2f6a077d93294ff073ee8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 56faf9049d65e2c9cf092c1e275820b237abb3563088e0e658476895183ae22124e6c0c2c6e528cafa649e216f1f15bb3a55e32ec50a0202cbd7dd2554f72ee7
|
7
|
+
data.tar.gz: 358cc2a90c568d109926ca8a6ebae9e2ad462d1819142cc10614471565ed1e5e87e72d92c456c9175b3000533d7cf575f08a338692a62d08519abe1723df7213
|
data/collamine.gemspec
CHANGED
data/lib/collamine.rb
CHANGED
@@ -1,19 +1,14 @@
|
|
1
1
|
require 'collamine/request'
|
2
2
|
require 'spidercrawl'
|
3
3
|
|
4
|
-
require 'curb'
|
5
|
-
require 'domainatrix'
|
6
|
-
require 'mongo'
|
7
|
-
|
8
4
|
class Collamine
|
9
|
-
include Mongo
|
10
|
-
STORE = MongoClient.new("localhost", 27017).db("smartcache").collection("html")
|
11
|
-
|
12
5
|
def self.start(url, options)
|
6
|
+
Request.setup_collamine(download: options[:download], upload: options[:upload])
|
13
7
|
from_collamine = []
|
14
8
|
pages = Spiderman.shoot(url, options) do |web|
|
15
9
|
collamine = nil
|
16
10
|
web.before_fetch do |url|
|
11
|
+
@setup.yield url unless @setup.nil?
|
17
12
|
# Try to fetch from collamine server
|
18
13
|
puts "trying collamine: #{url}"
|
19
14
|
puts "fetched from collamine: #{url}" if (collamine = Request.try_collamine(url))
|
@@ -36,25 +31,18 @@ class Collamine
|
|
36
31
|
filename += '.html' unless filename.include?('.html')
|
37
32
|
Request.upload_to_collamine(page.url, page.content, filename, page.crawled_time.to_i)
|
38
33
|
end
|
39
|
-
# Check if duplicate
|
40
|
-
unless STORE.find("url" => page.url).to_a.size > 0
|
41
|
-
# Insert into Mongodb
|
42
|
-
puts "Insert to db: #{page.url}"
|
43
|
-
source = (from_collamine.include?(page.url) ? 'collamine' : 'original')
|
44
|
-
doc = {:url => page.url,
|
45
|
-
:domain => Domainatrix.parse(page.url).domain,
|
46
|
-
:source => source,
|
47
|
-
:content => page.content.encode('UTF-8', 'ISO-8859-15'),
|
48
|
-
:crawled_date => page.crawled_time.to_i,
|
49
|
-
:response_time => page.response_time.to_i
|
50
|
-
}
|
51
|
-
STORE.insert(doc)
|
52
|
-
else
|
53
|
-
puts "url exists"
|
54
|
-
end
|
55
34
|
end
|
35
|
+
@teardown.yield page, from_collamine unless @teardown.nil?
|
56
36
|
end
|
57
37
|
end
|
58
38
|
return pages, from_collamine
|
59
39
|
end
|
40
|
+
|
41
|
+
def self.before_fetch(&block)
|
42
|
+
@setup = block if block
|
43
|
+
end
|
44
|
+
|
45
|
+
def self.after_fetch(&block)
|
46
|
+
@teardown = block if block
|
47
|
+
end
|
60
48
|
end
|
data/lib/collamine/request.rb
CHANGED
@@ -4,13 +4,17 @@ require 'domainatrix'
|
|
4
4
|
|
5
5
|
# Makes the request to CollaMine servers
|
6
6
|
class Request
|
7
|
-
COLLAMINE_DOWNLOAD_URL = 'http://172.31.22.135:9001/download/html/'
|
8
|
-
COLLAMINE_UPLOAD_URL = 'http://172.31.22.135:9001/upload/html/multipart/'
|
7
|
+
#COLLAMINE_DOWNLOAD_URL = 'http://172.31.22.135:9001/download/html/'
|
8
|
+
#COLLAMINE_UPLOAD_URL = 'http://172.31.22.135:9001/upload/html/multipart/'
|
9
|
+
def self.setup_collamine(options)
|
10
|
+
@collamine_download_url = options[:download]
|
11
|
+
@collamine_upload_url = options[:upload]
|
12
|
+
end
|
9
13
|
#
|
10
14
|
# Try downloading the content from CollaMine servers
|
11
15
|
#
|
12
16
|
def self.try_collamine(url)
|
13
|
-
uri = URI.parse(COLLAMINE_DOWNLOAD_URL+CGI::escape(url.to_s))
|
17
|
+
uri = URI.parse(@collamine_download_url+CGI::escape(url.to_s))
|
14
18
|
Net::HTTP.start(uri.host, uri.port) do |http|
|
15
19
|
response = http.get(uri)
|
16
20
|
case response
|
@@ -25,13 +29,13 @@ class Request
|
|
25
29
|
# Upload the content to Collamine servers
|
26
30
|
#
|
27
31
|
def self.upload_to_collamine(url, content, filename, crawltime)
|
28
|
-
post_request = Net::HTTP::Post::Multipart.new COLLAMINE_UPLOAD_URL,
|
32
|
+
post_request = Net::HTTP::Post::Multipart.new @collamine_upload_url,
|
29
33
|
'domain' => Domainatrix.parse(url).domain,
|
30
34
|
'url' => url,
|
31
35
|
'crawltime' => crawltime,
|
32
36
|
'contributor' => 'belson',
|
33
37
|
'document' => UploadIO.new(StringIO.new(content.encode('UTF-8', 'ISO-8859-15')), 'text/html', filename)
|
34
|
-
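response = Net::HTTP.start(URI.parse(COLLAMINE_UPLOAD_URL).host, URI.parse(COLLAMINE_UPLOAD_URL).port) { |http| http.request(post_request) }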
|
38
|
+
response = Net::HTTP.start(URI.parse(@collamine_upload_url).host, URI.parse(@collamine_upload_url).port) { |http| http.request(post_request) }
|
35
39
|
puts response.body
|
36
40
|
end
|
37
41
|
end
|
data/lib/collamine/version.rb
CHANGED
data/test.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'collamine'
|
2
|
-
|
2
|
+
require 'mongo'
|
3
|
+
require 'domainatrix'
|
3
4
|
#http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/
|
4
5
|
#http://forums.hardwarezone.com.sg/money-mind-210/
|
5
6
|
#http://sgforums.com/forums/4
|
@@ -13,10 +14,37 @@ require 'collamine'
|
|
13
14
|
#https://www.apple.com/sg/
|
14
15
|
#http://forums.hardwarezone.com.sg/current-affairs-lounge-17/
|
15
16
|
|
17
|
+
include Mongo
|
18
|
+
STORE = MongoClient.new("localhost", 27017).db("smartcache").collection("html")
|
19
|
+
|
20
|
+
Collamine.before_fetch do |url|
|
21
|
+
puts "Do what you want to the url: #{url}"
|
22
|
+
end
|
23
|
+
|
24
|
+
Collamine.after_fetch do |page, from_collamine|
|
25
|
+
# Check if duplicate
|
26
|
+
unless STORE.find("url" => page.url).to_a.size > 0
|
27
|
+
# Insert into Mongodb
|
28
|
+
puts "Insert to db: #{page.url}"
|
29
|
+
source = (from_collamine.include?(page.url) ? 'collamine' : 'original')
|
30
|
+
doc = {:url => page.url,
|
31
|
+
:domain => Domainatrix.parse(page.url).domain,
|
32
|
+
:source => source,
|
33
|
+
:content => page.content.encode('UTF-8', 'ISO-8859-15'),
|
34
|
+
:crawled_date => page.crawled_time.to_i,
|
35
|
+
:response_time => page.response_time.to_i
|
36
|
+
}
|
37
|
+
STORE.insert(doc)
|
38
|
+
else
|
39
|
+
puts "url exists"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
16
43
|
pages, from_collamine = Collamine.start('http://forums.hardwarezone.com.sg/money-mind-210/',
|
17
|
-
|
18
|
-
|
19
|
-
|
44
|
+
:pattern => Regexp.new('^http:\/\/forums\.hardwarezone\.com\.sg\/money-mind-210\/?(.*\.html)?$'),
|
45
|
+
:download => 'http://172.20.131.150:9001/download/html/',
|
46
|
+
:upload => 'http://172.20.131.150:9001/upload/html/multipart/',
|
47
|
+
:parallel => true, :threads => 10)
|
20
48
|
|
21
49
|
puts "Total pages crawled: #{pages.size}"
|
22
50
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: collamine
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Belson Heng
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -80,34 +80,6 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: mongo
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - ">="
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
90
|
-
type: :runtime
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: bson_ext
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
type: :runtime
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - ">="
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
111
83
|
description: Collamine is a ruby gem for CollaMine client, which communicates with
|
112
84
|
CollaMine servers to download content from their SmartCache if it exists.
|
113
85
|
email:
|
@@ -146,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
146
118
|
version: '0'
|
147
119
|
requirements: []
|
148
120
|
rubyforge_project:
|
149
|
-
rubygems_version: 2.
|
121
|
+
rubygems_version: 2.2.2
|
150
122
|
signing_key:
|
151
123
|
specification_version: 4
|
152
124
|
summary: Collamine lets you crawl a web site using SpiderCrawl library and share the
|