collamine 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/collamine.gemspec +0 -2
- data/lib/collamine.rb +11 -23
- data/lib/collamine/request.rb +9 -5
- data/lib/collamine/version.rb +1 -1
- data/test.rb +32 -4
- metadata +3 -31
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4082cf4e4a34cca97098611238f3ad46cf6e74d0
|
4
|
+
data.tar.gz: 7c5302924a2c2adfcd2f6a077d93294ff073ee8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 56faf9049d65e2c9cf092c1e275820b237abb3563088e0e658476895183ae22124e6c0c2c6e528cafa649e216f1f15bb3a55e32ec50a0202cbd7dd2554f72ee7
|
7
|
+
data.tar.gz: 358cc2a90c568d109926ca8a6ebae9e2ad462d1819142cc10614471565ed1e5e87e72d92c456c9175b3000533d7cf575f08a338692a62d08519abe1723df7213
|
data/collamine.gemspec
CHANGED
data/lib/collamine.rb
CHANGED
@@ -1,19 +1,14 @@
|
|
1
1
|
require 'collamine/request'
|
2
2
|
require 'spidercrawl'
|
3
3
|
|
4
|
-
require 'curb'
|
5
|
-
require 'domainatrix'
|
6
|
-
require 'mongo'
|
7
|
-
|
8
4
|
class Collamine
|
9
|
-
include Mongo
|
10
|
-
STORE = MongoClient.new("localhost", 27017).db("smartcache").collection("html")
|
11
|
-
|
12
5
|
def self.start(url, options)
|
6
|
+
Request.setup_collamine(download: options[:download], upload: options[:upload])
|
13
7
|
from_collamine = []
|
14
8
|
pages = Spiderman.shoot(url, options) do |web|
|
15
9
|
collamine = nil
|
16
10
|
web.before_fetch do |url|
|
11
|
+
@setup.yield url unless @setup.nil?
|
17
12
|
# Try to fetch from collamine server
|
18
13
|
puts "trying collamine: #{url}"
|
19
14
|
puts "fetched from collamine: #{url}" if (collamine = Request.try_collamine(url))
|
@@ -36,25 +31,18 @@ class Collamine
|
|
36
31
|
filename += '.html' unless filename.include?('.html')
|
37
32
|
Request.upload_to_collamine(page.url, page.content, filename, page.crawled_time.to_i)
|
38
33
|
end
|
39
|
-
# Check if duplicate
|
40
|
-
unless STORE.find("url" => page.url).to_a.size > 0
|
41
|
-
# Insert into Mongodb
|
42
|
-
puts "Insert to db: #{page.url}"
|
43
|
-
source = (from_collamine.include?(page.url) ? 'collamine' : 'original')
|
44
|
-
doc = {:url => page.url,
|
45
|
-
:domain => Domainatrix.parse(page.url).domain,
|
46
|
-
:source => source,
|
47
|
-
:content => page.content.encode('UTF-8', 'ISO-8859-15'),
|
48
|
-
:crawled_date => page.crawled_time.to_i,
|
49
|
-
:response_time => page.response_time.to_i
|
50
|
-
}
|
51
|
-
STORE.insert(doc)
|
52
|
-
else
|
53
|
-
puts "url exists"
|
54
|
-
end
|
55
34
|
end
|
35
|
+
@teardown.yield page, from_collamine unless @teardown.nil?
|
56
36
|
end
|
57
37
|
end
|
58
38
|
return pages, from_collamine
|
59
39
|
end
|
40
|
+
|
41
|
+
def self.before_fetch(&block)
|
42
|
+
@setup = block if block
|
43
|
+
end
|
44
|
+
|
45
|
+
def self.after_fetch(&block)
|
46
|
+
@teardown = block if block
|
47
|
+
end
|
60
48
|
end
|
data/lib/collamine/request.rb
CHANGED
@@ -4,13 +4,17 @@ require 'domainatrix'
|
|
4
4
|
|
5
5
|
# Makes the request to CollaMine servers
|
6
6
|
class Request
|
7
|
-
COLLAMINE_DOWNLOAD_URL = 'http://172.31.22.135:9001/download/html/'
|
8
|
-
COLLAMINE_UPLOAD_URL = 'http://172.31.22.135:9001/upload/html/multipart/'
|
7
|
+
#COLLAMINE_DOWNLOAD_URL = 'http://172.31.22.135:9001/download/html/'
|
8
|
+
#COLLAMINE_UPLOAD_URL = 'http://172.31.22.135:9001/upload/html/multipart/'
|
9
|
+
def self.setup_collamine(options)
|
10
|
+
@collamine_download_url = options[:download]
|
11
|
+
@collamine_upload_url = options[:upload]
|
12
|
+
end
|
9
13
|
#
|
10
14
|
# Try downloading the content from CollaMine servers
|
11
15
|
#
|
12
16
|
def self.try_collamine(url)
|
13
|
-
uri = URI.parse(
|
17
|
+
uri = URI.parse(@collamine_download_url+CGI::escape(url.to_s))
|
14
18
|
Net::HTTP.start(uri.host, uri.port) do |http|
|
15
19
|
response = http.get(uri)
|
16
20
|
case response
|
@@ -25,13 +29,13 @@ class Request
|
|
25
29
|
# Upload the content to Collamine servers
|
26
30
|
#
|
27
31
|
def self.upload_to_collamine(url, content, filename, crawltime)
|
28
|
-
post_request = Net::HTTP::Post::Multipart.new
|
32
|
+
post_request = Net::HTTP::Post::Multipart.new @collamine_upload_url,
|
29
33
|
'domain' => Domainatrix.parse(url).domain,
|
30
34
|
'url' => url,
|
31
35
|
'crawltime' => crawltime,
|
32
36
|
'contributor' => 'belson',
|
33
37
|
'document' => UploadIO.new(StringIO.new(content.encode('UTF-8', 'ISO-8859-15')), 'text/html', filename)
|
34
|
-
response = Net::HTTP.start(URI.parse(
|
38
|
+
response = Net::HTTP.start(URI.parse(@collamine_upload_url).host, URI.parse(@collamine_upload_url).port) { |http| http.request(post_request) }
|
35
39
|
puts response.body
|
36
40
|
end
|
37
41
|
end
|
data/lib/collamine/version.rb
CHANGED
data/test.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'collamine'
|
2
|
-
|
2
|
+
require 'mongo'
|
3
|
+
require 'domainatrix'
|
3
4
|
#http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/
|
4
5
|
#http://forums.hardwarezone.com.sg/money-mind-210/
|
5
6
|
#http://sgforums.com/forums/4
|
@@ -13,10 +14,37 @@ require 'collamine'
|
|
13
14
|
#https://www.apple.com/sg/
|
14
15
|
#http://forums.hardwarezone.com.sg/current-affairs-lounge-17/
|
15
16
|
|
17
|
+
include Mongo
|
18
|
+
STORE = MongoClient.new("localhost", 27017).db("smartcache").collection("html")
|
19
|
+
|
20
|
+
Collamine.before_fetch do |url|
|
21
|
+
puts "Do what you want to the url: #{url}"
|
22
|
+
end
|
23
|
+
|
24
|
+
Collamine.after_fetch do |page, from_collamine|
|
25
|
+
# Check if duplicate
|
26
|
+
unless STORE.find("url" => page.url).to_a.size > 0
|
27
|
+
# Insert into Mongodb
|
28
|
+
puts "Insert to db: #{page.url}"
|
29
|
+
source = (from_collamine.include?(page.url) ? 'collamine' : 'original')
|
30
|
+
doc = {:url => page.url,
|
31
|
+
:domain => Domainatrix.parse(page.url).domain,
|
32
|
+
:source => source,
|
33
|
+
:content => page.content.encode('UTF-8', 'ISO-8859-15'),
|
34
|
+
:crawled_date => page.crawled_time.to_i,
|
35
|
+
:response_time => page.response_time.to_i
|
36
|
+
}
|
37
|
+
STORE.insert(doc)
|
38
|
+
else
|
39
|
+
puts "url exists"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
16
43
|
pages, from_collamine = Collamine.start('http://forums.hardwarezone.com.sg/money-mind-210/',
|
17
|
-
|
18
|
-
|
19
|
-
|
44
|
+
:pattern => Regexp.new('^http:\/\/forums\.hardwarezone\.com\.sg\/money-mind-210\/?(.*\.html)?$'),
|
45
|
+
:download => 'http://172.20.131.150:9001/download/html/',
|
46
|
+
:upload => 'http://172.20.131.150:9001/upload/html/multipart/',
|
47
|
+
:parallel => true, :threads => 10)
|
20
48
|
|
21
49
|
puts "Total pages crawled: #{pages.size}"
|
22
50
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: collamine
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Belson Heng
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -80,34 +80,6 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: mongo
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - ">="
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
90
|
-
type: :runtime
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: bson_ext
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
type: :runtime
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - ">="
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
111
83
|
description: Collamine is a ruby gem for CollaMine client, which communicates with
|
112
84
|
CollaMine servers to download content from their SmartCache if it exists.
|
113
85
|
email:
|
@@ -146,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
146
118
|
version: '0'
|
147
119
|
requirements: []
|
148
120
|
rubyforge_project:
|
149
|
-
rubygems_version: 2.
|
121
|
+
rubygems_version: 2.2.2
|
150
122
|
signing_key:
|
151
123
|
specification_version: 4
|
152
124
|
summary: Collamine lets you crawl a web site using SpiderCrawl library and share the
|