collamine 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0ff9befa1cf2bd9dfe6fdae08b111d4555fca9a3
4
- data.tar.gz: bf78076f8b51c02eb3172ff9a5c1ff98a03fd95b
3
+ metadata.gz: 4082cf4e4a34cca97098611238f3ad46cf6e74d0
4
+ data.tar.gz: 7c5302924a2c2adfcd2f6a077d93294ff073ee8c
5
5
  SHA512:
6
- metadata.gz: 8a048c8f364ecbf1d5f768ffd6e439f962882a5ed128b2d72b560adee8357d753b64e52ec97f77d98f06fd19c82791c967637af4802eab72e0bfb15b109291c2
7
- data.tar.gz: 2b15cbef9354411d0b18aa24a2317f58128a1e873b2a0891d60daacbe8a743719017cc6ad2a78215de17f5211719339a83708c35c34c7cc95f46844acfdfa310
6
+ metadata.gz: 56faf9049d65e2c9cf092c1e275820b237abb3563088e0e658476895183ae22124e6c0c2c6e528cafa649e216f1f15bb3a55e32ec50a0202cbd7dd2554f72ee7
7
+ data.tar.gz: 358cc2a90c568d109926ca8a6ebae9e2ad462d1819142cc10614471565ed1e5e87e72d92c456c9175b3000533d7cf575f08a338692a62d08519abe1723df7213
@@ -24,6 +24,4 @@ Gem::Specification.new do |spec|
24
24
  spec.add_dependency "spidercrawl"
25
25
  spec.add_dependency "multipart-post"
26
26
  spec.add_dependency "domainatrix"
27
- spec.add_dependency "mongo"
28
- spec.add_dependency "bson_ext"
29
27
  end
@@ -1,19 +1,14 @@
1
1
  require 'collamine/request'
2
2
  require 'spidercrawl'
3
3
 
4
- require 'curb'
5
- require 'domainatrix'
6
- require 'mongo'
7
-
8
4
  class Collamine
9
- include Mongo
10
- STORE = MongoClient.new("localhost", 27017).db("smartcache").collection("html")
11
-
12
5
  def self.start(url, options)
6
+ Request.setup_collamine(download: options[:download], upload: options[:upload])
13
7
  from_collamine = []
14
8
  pages = Spiderman.shoot(url, options) do |web|
15
9
  collamine = nil
16
10
  web.before_fetch do |url|
11
+ @setup.yield url unless @setup.nil?
17
12
  # Try to fetch from collamine server
18
13
  puts "trying collamine: #{url}"
19
14
  puts "fetched from collamine: #{url}" if (collamine = Request.try_collamine(url))
@@ -36,25 +31,18 @@ class Collamine
36
31
  filename += '.html' unless filename.include?('.html')
37
32
  Request.upload_to_collamine(page.url, page.content, filename, page.crawled_time.to_i)
38
33
  end
39
- # Check if duplicate
40
- unless STORE.find("url" => page.url).to_a.size > 0
41
- # Insert into Mongodb
42
- puts "Insert to db: #{page.url}"
43
- source = (from_collamine.include?(page.url) ? 'collamine' : 'original')
44
- doc = {:url => page.url,
45
- :domain => Domainatrix.parse(page.url).domain,
46
- :source => source,
47
- :content => page.content.encode('UTF-8', 'ISO-8859-15'),
48
- :crawled_date => page.crawled_time.to_i,
49
- :response_time => page.response_time.to_i
50
- }
51
- STORE.insert(doc)
52
- else
53
- puts "url exists"
54
- end
55
34
  end
35
+ @teardown.yield page, from_collamine unless @teardown.nil?
56
36
  end
57
37
  end
58
38
  return pages, from_collamine
59
39
  end
40
+
41
+ def self.before_fetch(&block)
42
+ @setup = block if block
43
+ end
44
+
45
+ def self.after_fetch(&block)
46
+ @teardown = block if block
47
+ end
60
48
  end
@@ -4,13 +4,17 @@ require 'domainatrix'
4
4
 
5
5
  # Makes the request to CollaMine servers
6
6
  class Request
7
- COLLAMINE_DOWNLOAD_URL = 'http://172.31.22.135:9001/download/html/'
8
- COLLAMINE_UPLOAD_URL = 'http://172.31.22.135:9001/upload/html/multipart/'
7
+ #COLLAMINE_DOWNLOAD_URL = 'http://172.31.22.135:9001/download/html/'
8
+ #COLLAMINE_UPLOAD_URL = 'http://172.31.22.135:9001/upload/html/multipart/'
9
+ def self.setup_collamine(options)
10
+ @collamine_download_url = options[:download]
11
+ @collamine_upload_url = options[:upload]
12
+ end
9
13
  #
10
14
  # Try downloading the content from CollaMine servers
11
15
  #
12
16
  def self.try_collamine(url)
13
- uri = URI.parse(COLLAMINE_DOWNLOAD_URL+CGI::escape(url.to_s))
17
+ uri = URI.parse(@collamine_download_url+CGI::escape(url.to_s))
14
18
  Net::HTTP.start(uri.host, uri.port) do |http|
15
19
  response = http.get(uri)
16
20
  case response
@@ -25,13 +29,13 @@ class Request
25
29
  # Upload the content to Collamine servers
26
30
  #
27
31
  def self.upload_to_collamine(url, content, filename, crawltime)
28
- post_request = Net::HTTP::Post::Multipart.new COLLAMINE_UPLOAD_URL,
32
+ post_request = Net::HTTP::Post::Multipart.new @collamine_upload_url,
29
33
  'domain' => Domainatrix.parse(url).domain,
30
34
  'url' => url,
31
35
  'crawltime' => crawltime,
32
36
  'contributor' => 'belson',
33
37
  'document' => UploadIO.new(StringIO.new(content.encode('UTF-8', 'ISO-8859-15')), 'text/html', filename)
34
- response = Net::HTTP.start(URI.parse(COLLAMINE_UPLOAD_URL).host, URI.parse(COLLAMINE_UPLOAD_URL).port) { |http| http.request(post_request) }
38
+ response = Net::HTTP.start(URI.parse(@collamine_upload_url).host, URI.parse(@collamine_upload_url).port) { |http| http.request(post_request) }
35
39
  puts response.body
36
40
  end
37
41
  end
@@ -1,3 +1,3 @@
1
1
  module Collamine
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
data/test.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'collamine'
2
-
2
+ require 'mongo'
3
+ require 'domainatrix'
3
4
  #http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/
4
5
  #http://forums.hardwarezone.com.sg/money-mind-210/
5
6
  #http://sgforums.com/forums/4
@@ -13,10 +14,37 @@ require 'collamine'
13
14
  #https://www.apple.com/sg/
14
15
  #http://forums.hardwarezone.com.sg/current-affairs-lounge-17/
15
16
 
17
+ include Mongo
18
+ STORE = MongoClient.new("localhost", 27017).db("smartcache").collection("html")
19
+
20
+ Collamine.before_fetch do |url|
21
+ puts "Do what you want to the url: #{url}"
22
+ end
23
+
24
+ Collamine.after_fetch do |page, from_collamine|
25
+ # Check if duplicate
26
+ unless STORE.find("url" => page.url).to_a.size > 0
27
+ # Insert into Mongodb
28
+ puts "Insert to db: #{page.url}"
29
+ source = (from_collamine.include?(page.url) ? 'collamine' : 'original')
30
+ doc = {:url => page.url,
31
+ :domain => Domainatrix.parse(page.url).domain,
32
+ :source => source,
33
+ :content => page.content.encode('UTF-8', 'ISO-8859-15'),
34
+ :crawled_date => page.crawled_time.to_i,
35
+ :response_time => page.response_time.to_i
36
+ }
37
+ STORE.insert(doc)
38
+ else
39
+ puts "url exists"
40
+ end
41
+ end
42
+
16
43
  pages, from_collamine = Collamine.start('http://forums.hardwarezone.com.sg/money-mind-210/',
17
- :parallel => true,
18
- :threads => 10,
19
- :pattern => Regexp.new('^http:\/\/forums\.hardwarezone\.com\.sg\/money-mind-210\/?(.*\.html)?$'))
44
+ :pattern => Regexp.new('^http:\/\/forums\.hardwarezone\.com\.sg\/money-mind-210\/?(.*\.html)?$'),
45
+ :download => 'http://172.20.131.150:9001/download/html/',
46
+ :upload => 'http://172.20.131.150:9001/upload/html/multipart/',
47
+ :parallel => true, :threads => 10)
20
48
 
21
49
  puts "Total pages crawled: #{pages.size}"
22
50
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: collamine
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Belson Heng
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-10 00:00:00.000000000 Z
11
+ date: 2016-03-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -80,34 +80,6 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
- - !ruby/object:Gem::Dependency
84
- name: mongo
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - ">="
88
- - !ruby/object:Gem::Version
89
- version: '0'
90
- type: :runtime
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- version: '0'
97
- - !ruby/object:Gem::Dependency
98
- name: bson_ext
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - ">="
102
- - !ruby/object:Gem::Version
103
- version: '0'
104
- type: :runtime
105
- prerelease: false
106
- version_requirements: !ruby/object:Gem::Requirement
107
- requirements:
108
- - - ">="
109
- - !ruby/object:Gem::Version
110
- version: '0'
111
83
  description: Collamine is a ruby gem for CollaMine client, which communicates with
112
84
  CollaMine servers to download content from their SmartCache if it exists.
113
85
  email:
@@ -146,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
146
118
  version: '0'
147
119
  requirements: []
148
120
  rubyforge_project:
149
- rubygems_version: 2.4.5
121
+ rubygems_version: 2.2.2
150
122
  signing_key:
151
123
  specification_version: 4
152
124
  summary: Collamine lets you crawl a web site using SpiderCrawl library and share the