collamine 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0ff9befa1cf2bd9dfe6fdae08b111d4555fca9a3
4
- data.tar.gz: bf78076f8b51c02eb3172ff9a5c1ff98a03fd95b
3
+ metadata.gz: 4082cf4e4a34cca97098611238f3ad46cf6e74d0
4
+ data.tar.gz: 7c5302924a2c2adfcd2f6a077d93294ff073ee8c
5
5
  SHA512:
6
- metadata.gz: 8a048c8f364ecbf1d5f768ffd6e439f962882a5ed128b2d72b560adee8357d753b64e52ec97f77d98f06fd19c82791c967637af4802eab72e0bfb15b109291c2
7
- data.tar.gz: 2b15cbef9354411d0b18aa24a2317f58128a1e873b2a0891d60daacbe8a743719017cc6ad2a78215de17f5211719339a83708c35c34c7cc95f46844acfdfa310
6
+ metadata.gz: 56faf9049d65e2c9cf092c1e275820b237abb3563088e0e658476895183ae22124e6c0c2c6e528cafa649e216f1f15bb3a55e32ec50a0202cbd7dd2554f72ee7
7
+ data.tar.gz: 358cc2a90c568d109926ca8a6ebae9e2ad462d1819142cc10614471565ed1e5e87e72d92c456c9175b3000533d7cf575f08a338692a62d08519abe1723df7213
@@ -24,6 +24,4 @@ Gem::Specification.new do |spec|
24
24
  spec.add_dependency "spidercrawl"
25
25
  spec.add_dependency "multipart-post"
26
26
  spec.add_dependency "domainatrix"
27
- spec.add_dependency "mongo"
28
- spec.add_dependency "bson_ext"
29
27
  end
@@ -1,19 +1,14 @@
1
1
  require 'collamine/request'
2
2
  require 'spidercrawl'
3
3
 
4
- require 'curb'
5
- require 'domainatrix'
6
- require 'mongo'
7
-
8
4
  class Collamine
9
- include Mongo
10
- STORE = MongoClient.new("localhost", 27017).db("smartcache").collection("html")
11
-
12
5
  def self.start(url, options)
6
+ Request.setup_collamine(download: options[:download], upload: options[:upload])
13
7
  from_collamine = []
14
8
  pages = Spiderman.shoot(url, options) do |web|
15
9
  collamine = nil
16
10
  web.before_fetch do |url|
11
+ @setup.yield url unless @setup.nil?
17
12
  # Try to fetch from collamine server
18
13
  puts "trying collamine: #{url}"
19
14
  puts "fetched from collamine: #{url}" if (collamine = Request.try_collamine(url))
@@ -36,25 +31,18 @@ class Collamine
36
31
  filename += '.html' unless filename.include?('.html')
37
32
  Request.upload_to_collamine(page.url, page.content, filename, page.crawled_time.to_i)
38
33
  end
39
- # Check if duplicate
40
- unless STORE.find("url" => page.url).to_a.size > 0
41
- # Insert into Mongodb
42
- puts "Insert to db: #{page.url}"
43
- source = (from_collamine.include?(page.url) ? 'collamine' : 'original')
44
- doc = {:url => page.url,
45
- :domain => Domainatrix.parse(page.url).domain,
46
- :source => source,
47
- :content => page.content.encode('UTF-8', 'ISO-8859-15'),
48
- :crawled_date => page.crawled_time.to_i,
49
- :response_time => page.response_time.to_i
50
- }
51
- STORE.insert(doc)
52
- else
53
- puts "url exists"
54
- end
55
34
  end
35
+ @teardown.yield page, from_collamine unless @teardown.nil?
56
36
  end
57
37
  end
58
38
  return pages, from_collamine
59
39
  end
40
+
41
+ def self.before_fetch(&block)
42
+ @setup = block if block
43
+ end
44
+
45
+ def self.after_fetch(&block)
46
+ @teardown = block if block
47
+ end
60
48
  end
@@ -4,13 +4,17 @@ require 'domainatrix'
4
4
 
5
5
  # Makes the request to CollaMine servers
6
6
  class Request
7
- COLLAMINE_DOWNLOAD_URL = 'http://172.31.22.135:9001/download/html/'
8
- COLLAMINE_UPLOAD_URL = 'http://172.31.22.135:9001/upload/html/multipart/'
7
+ #COLLAMINE_DOWNLOAD_URL = 'http://172.31.22.135:9001/download/html/'
8
+ #COLLAMINE_UPLOAD_URL = 'http://172.31.22.135:9001/upload/html/multipart/'
9
+ def self.setup_collamine(options)
10
+ @collamine_download_url = options[:download]
11
+ @collamine_upload_url = options[:upload]
12
+ end
9
13
  #
10
14
  # Try downloading the content from CollaMine servers
11
15
  #
12
16
  def self.try_collamine(url)
13
- uri = URI.parse(COLLAMINE_DOWNLOAD_URL+CGI::escape(url.to_s))
17
+ uri = URI.parse(@collamine_download_url+CGI::escape(url.to_s))
14
18
  Net::HTTP.start(uri.host, uri.port) do |http|
15
19
  response = http.get(uri)
16
20
  case response
@@ -25,13 +29,13 @@ class Request
25
29
  # Upload the content to Collamine servers
26
30
  #
27
31
  def self.upload_to_collamine(url, content, filename, crawltime)
28
- post_request = Net::HTTP::Post::Multipart.new COLLAMINE_UPLOAD_URL,
32
+ post_request = Net::HTTP::Post::Multipart.new @collamine_upload_url,
29
33
  'domain' => Domainatrix.parse(url).domain,
30
34
  'url' => url,
31
35
  'crawltime' => crawltime,
32
36
  'contributor' => 'belson',
33
37
  'document' => UploadIO.new(StringIO.new(content.encode('UTF-8', 'ISO-8859-15')), 'text/html', filename)
34
- response = Net::HTTP.start(URI.parse(COLLAMINE_UPLOAD_URL).host, URI.parse(COLLAMINE_UPLOAD_URL).port) { |http| http.request(post_request) }
38
+ response = Net::HTTP.start(URI.parse(@collamine_upload_url).host, URI.parse(@collamine_upload_url).port) { |http| http.request(post_request) }
35
39
  puts response.body
36
40
  end
37
41
  end
@@ -1,3 +1,3 @@
1
1
  module Collamine
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
data/test.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'collamine'
2
-
2
+ require 'mongo'
3
+ require 'domainatrix'
3
4
  #http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/
4
5
  #http://forums.hardwarezone.com.sg/money-mind-210/
5
6
  #http://sgforums.com/forums/4
@@ -13,10 +14,37 @@ require 'collamine'
13
14
  #https://www.apple.com/sg/
14
15
  #http://forums.hardwarezone.com.sg/current-affairs-lounge-17/
15
16
 
17
+ include Mongo
18
+ STORE = MongoClient.new("localhost", 27017).db("smartcache").collection("html")
19
+
20
+ Collamine.before_fetch do |url|
21
+ puts "Do what you want to the url: #{url}"
22
+ end
23
+
24
+ Collamine.after_fetch do |page, from_collamine|
25
+ # Check if duplicate
26
+ unless STORE.find("url" => page.url).to_a.size > 0
27
+ # Insert into Mongodb
28
+ puts "Insert to db: #{page.url}"
29
+ source = (from_collamine.include?(page.url) ? 'collamine' : 'original')
30
+ doc = {:url => page.url,
31
+ :domain => Domainatrix.parse(page.url).domain,
32
+ :source => source,
33
+ :content => page.content.encode('UTF-8', 'ISO-8859-15'),
34
+ :crawled_date => page.crawled_time.to_i,
35
+ :response_time => page.response_time.to_i
36
+ }
37
+ STORE.insert(doc)
38
+ else
39
+ puts "url exists"
40
+ end
41
+ end
42
+
16
43
  pages, from_collamine = Collamine.start('http://forums.hardwarezone.com.sg/money-mind-210/',
17
- :parallel => true,
18
- :threads => 10,
19
- :pattern => Regexp.new('^http:\/\/forums\.hardwarezone\.com\.sg\/money-mind-210\/?(.*\.html)?$'))
44
+ :pattern => Regexp.new('^http:\/\/forums\.hardwarezone\.com\.sg\/money-mind-210\/?(.*\.html)?$'),
45
+ :download => 'http://172.20.131.150:9001/download/html/',
46
+ :upload => 'http://172.20.131.150:9001/upload/html/multipart/',
47
+ :parallel => true, :threads => 10)
20
48
 
21
49
  puts "Total pages crawled: #{pages.size}"
22
50
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: collamine
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Belson Heng
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-10 00:00:00.000000000 Z
11
+ date: 2016-03-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -80,34 +80,6 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
- - !ruby/object:Gem::Dependency
84
- name: mongo
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - ">="
88
- - !ruby/object:Gem::Version
89
- version: '0'
90
- type: :runtime
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- version: '0'
97
- - !ruby/object:Gem::Dependency
98
- name: bson_ext
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - ">="
102
- - !ruby/object:Gem::Version
103
- version: '0'
104
- type: :runtime
105
- prerelease: false
106
- version_requirements: !ruby/object:Gem::Requirement
107
- requirements:
108
- - - ">="
109
- - !ruby/object:Gem::Version
110
- version: '0'
111
83
  description: Collamine is a ruby gem for CollaMine client, which communicates with
112
84
  CollaMine servers to download content from their SmartCache if it exists.
113
85
  email:
@@ -146,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
146
118
  version: '0'
147
119
  requirements: []
148
120
  rubyforge_project:
149
- rubygems_version: 2.4.5
121
+ rubygems_version: 2.2.2
150
122
  signing_key:
151
123
  specification_version: 4
152
124
  summary: Collamine lets you crawl a web site using SpiderCrawl library and share the