collamine 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6b539a01cbb2e35c410520e1d2ad60e6a82c6635
4
+ data.tar.gz: 126dc784195a7c51efac6d7047875fa63a9e049e
5
+ SHA512:
6
+ metadata.gz: 53d249cb2200ff324cd356c98929f439873105089a69e8ea98aad49af7e36eaee1fffc74a42fa286c1fc9b377f44bf4d92e1f3a3849954540dde3e073262af74
7
+ data.tar.gz: 53fea6d874d66d6d48b3ac4c6453360e4409c30cd6b3eacc01873bc620f389b488c96420f75bb01c3079f94237cee3209fb220ce73a18fe8c8d94763d1bd26b6
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
15
+ .DS_Store
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in collamine.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Belson Heng
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,35 @@
1
+ # Collamine
2
+
3
+ This is a Ruby gem for the CollaMine client. It communicates with CollaMine servers to download content from their SmartCache when it is available there. It also lets you crawl a web site using [SpiderCrawl](https://github.com/belsonheng/spidercrawl) and share the results with the community via CollaMine servers.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'collamine'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install collamine
20
+
21
+ ## Usage
22
+
23
+ To get started,
24
+
25
+ require 'collamine'
26
+ Collamine.start('http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/',
27
+ :pattern => Regexp.new('^http:\/\/forums\.hardwarezone\.com\.sg\/hwm-magazine-publication-38\/?(.*\.html)?$'))
28
+
29
+ ## Contributing
30
+
31
+ 1. Fork it ( https://github.com/belsonheng/collamine/fork )
32
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
33
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
34
+ 4. Push to the branch (`git push origin my-new-feature`)
35
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
data/collamine.gemspec ADDED
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'collamine/version'
5
+
6
# Gem specification for collamine.
#
# Runtime dependencies mirror what the library actually requires:
# lib/collamine.rb loads spidercrawl, curb, domainatrix and mongo, and
# lib/collamine/request.rb loads multipart-post and domainatrix.
Gem::Specification.new do |spec|
  spec.name          = "collamine"
  spec.version       = Collamine::VERSION
  spec.authors       = ["Belson Heng"]
  spec.email         = ["belsonheng@gmail.com"]
  spec.summary       = %q{Collamine lets you crawl a web site and share the results with the community via CollaMine servers.}
  spec.description   = %q{Collamine is a ruby gem for CollaMine client, which communicates with CollaMine servers to download content from their SmartCache if it exists.}
  spec.homepage      = "http://github.com/belsonheng/collamine/"
  spec.license       = "MIT"

  # Package exactly what git tracks; NUL-separated so odd file names survive.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"

  spec.add_dependency "multipart-post"
  spec.add_dependency "domainatrix"
  # lib/collamine.rb requires both of these, so they must be installed with the gem.
  spec.add_dependency "spidercrawl"
  spec.add_dependency "curb"
  # The library uses Mongo::MongoClient and Collection#insert, which only
  # exist in the 1.x driver (MongoClient arrived in 1.8; 2.x removed it).
  spec.add_dependency "mongo", ">= 1.8", "< 2.0"
  spec.add_dependency "bson_ext", ">= 1.8", "< 2.0"
end
@@ -0,0 +1,37 @@
1
+ require 'cgi'
2
+ require 'net/http/post/multipart'
3
+ require 'domainatrix'
4
+
5
+ # Makes the request to CollaMine servers
6
# Makes the requests to CollaMine servers: looking up a cached copy of a
# page and uploading freshly crawled pages.
class Request
  # Endpoint that serves a cached page; the CGI-escaped page URL is appended.
  COLLAMINE_DOWNLOAD_URL = 'http://172.31.22.135:9001/download/html/'.freeze
  # Endpoint that accepts a multipart upload of a crawled page.
  COLLAMINE_UPLOAD_URL   = 'http://172.31.22.135:9001/upload/html/multipart/'.freeze

  # Errors that mean "the CollaMine server could not be reached or did not
  # answer sensibly". They are treated as a cache miss, not a crawl failure.
  NETWORK_ERRORS = [
    SocketError, SystemCallError, Timeout::Error, EOFError,
    Net::ProtocolError, Net::HTTPBadResponse
  ].freeze

  #
  # Try downloading the content from CollaMine servers.
  #
  # @param url [#to_s] the page URL to look up
  # @return [Net::HTTPResponse, nil] the successful response, or nil when the
  #   server answers with a non-2xx status, answers with the literal body
  #   'not found', or cannot be reached at all
  #
  def self.try_collamine(url)
    uri = URI.parse(COLLAMINE_DOWNLOAD_URL + CGI.escape(url.to_s))
    response = Net::HTTP.start(uri.host, uri.port) { |http| http.get(uri.request_uri) }

    return nil unless response.is_a?(Net::HTTPSuccess)
    # The server signals a cache miss with a 2xx response whose body is 'not found'.
    return nil if response.body == 'not found'

    response
  rescue *NETWORK_ERRORS => e
    # Best effort: an unreachable cache must not abort the crawl.
    warn "collamine lookup failed for #{url}: #{e.class}: #{e.message}"
    nil
  end

  #
  # Upload the content to Collamine servers and print the server's reply.
  #
  # @param url [String] the URL the content was crawled from
  # @param content [String] the page body; it is transcoded to UTF-8 treating
  #   the input as ISO-8859-15 (NOTE(review): confirm the crawler always hands
  #   over ISO-8859-15 text, otherwise UTF-8 pages get double-encoded)
  # @param filename [String] file name reported for the uploaded document
  # @param crawltime [Integer] crawl timestamp sent as the 'crawltime' field
  # @param contributor [String] value of the 'contributor' field
  # @return [nil]
  #
  def self.upload_to_collamine(url, content, filename, crawltime, contributor = 'belson')
    endpoint = URI.parse(COLLAMINE_UPLOAD_URL)
    document = UploadIO.new(StringIO.new(content.encode('UTF-8', 'ISO-8859-15')), 'text/html', filename)
    post_request = Net::HTTP::Post::Multipart.new COLLAMINE_UPLOAD_URL,
                                                  'domain'      => Domainatrix.parse(url).domain,
                                                  'url'         => url,
                                                  'crawltime'   => crawltime,
                                                  'contributor' => contributor,
                                                  'document'    => document
    response = Net::HTTP.start(endpoint.host, endpoint.port) { |http| http.request(post_request) }
    puts response.body
  end
end
@@ -0,0 +1,3 @@
1
# Version holder for the gem.
#
# Collamine is opened as a class because lib/collamine.rb defines
# `class Collamine`; declaring it as a module here makes Ruby raise
# "TypeError: Collamine is not a class" as soon as both files are loaded
# (for example when the gemspec has been evaluated before the library).
class Collamine
  VERSION = "0.2.0".freeze
end
data/lib/collamine.rb ADDED
@@ -0,0 +1,60 @@
1
+ require 'collamine/request'
2
+ require 'spidercrawl'
3
+
4
+ require 'curb'
5
+ require 'domainatrix'
6
+ require 'mongo'
7
+
8
# Crawls a site with Spidercrawl, serving pages from CollaMine when the
# servers have them, uploading pages they lack, and recording every crawled
# page in a local MongoDB collection.
class Collamine
  include Mongo
  # Local store for crawled pages. The connection is opened when this class
  # body is evaluated, i.e. as soon as the library is required.
  STORE = MongoClient.new("localhost", 27017).db("smartcache").collection("html")

  # Crawl +url+ and mirror the results into CollaMine and MongoDB.
  #
  # @param url [String] the URL to start crawling from
  # @param options [Hash] passed through unchanged to Spiderman.shoot
  # @return [Array] a two-element array: whatever Spiderman.shoot returned,
  #   and the list of URLs whose content came from CollaMine
  def self.start(url, options = {})
    from_collamine = []
    pages = Spiderman.shoot(url, options) do |web|
      # Result of the most recent CollaMine lookup; read by after_fetch.
      # NOTE(review): this single variable is shared by all three callbacks.
      # If Spidercrawl runs them on several threads (:parallel), after_fetch
      # may see the lookup result of a different URL -- confirm with Spidercrawl.
      collamine = nil

      # Shared by before_fetch and on_redirect: ask CollaMine for the page and
      # remember the URL when it is served from there.
      lookup = lambda do |target|
        puts "trying collamine: #{target}"
        collamine = Request.try_collamine(target)
        if collamine
          puts "fetched from collamine: #{target}"
          from_collamine << target
        end
        collamine
      end

      web.before_fetch { |target| lookup.call(target) }
      web.on_redirect  { |target| lookup.call(target) }

      web.after_fetch do |page|
        next if page.content == ''

        # Upload to collamine if it cannot be found in server
        unless collamine
          puts "uploading to collamine: #{page.url}"
          filename = page.url.split('/').last
          filename += '.html' unless filename.include?('.html')
          Request.upload_to_collamine(page.url, page.content, filename, page.crawled_time.to_i)
        end

        # Skip pages that are already stored; find_one stops at the first match
        # instead of loading every matching document.
        if STORE.find_one("url" => page.url)
          puts "url exists"
        else
          puts "Insert to db: #{page.url}"
          source = from_collamine.include?(page.url) ? 'collamine' : 'original'
          doc = {
            :url           => page.url,
            :domain        => Domainatrix.parse(page.url).domain,
            :source        => source,
            :content       => page.content.encode('UTF-8', 'ISO-8859-15'),
            :crawled_date  => page.crawled_time.to_i,
            :response_time => page.response_time.to_i
          }
          STORE.insert(doc)
        end
      end
    end
    [pages, from_collamine]
  end
end
+ end
data/test.rb ADDED
@@ -0,0 +1,28 @@
1
+ require 'collamine'
2
+
3
+ #http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/
4
+ #http://forums.hardwarezone.com.sg/money-mind-210/
5
+ #http://sgforums.com/forums/4
6
+ #http://forums.vr-zone.com/photography-lightroom/
7
+ #http://forums.gumi.sg/forum/news-boards
8
+ #http://en.forums.wordpress.com/
9
+ #http://www.spcnet.tv/forums/showthread.php/38762-Dugu-Jiu-Jian-Really-Unbeatable
10
+ #http://www.hungrygowhere.com/
11
+ #http://www.groupon.sg/
12
+ #http://www.amazon.com/
13
+ #https://www.apple.com/sg/
14
+ #http://forums.hardwarezone.com.sg/current-affairs-lounge-17/
15
+
16
# Crawl one forum section in parallel, restricted to URLs under that section,
# then report the number of crawled pages on stdout and in /tmp/ruby.log.
seed_url = 'http://forums.hardwarezone.com.sg/money-mind-210/'
crawl_options = {
  :parallel => true,
  :threads  => 10,
  :pattern  => Regexp.new('^http:\/\/forums\.hardwarezone\.com\.sg\/money-mind-210\/?(.*\.html)?$')
}

pages, from_collamine = Collamine.start(seed_url, crawl_options)

summary = "Total pages crawled: #{pages.size}"
puts summary

# One crawled URL per line, followed by the same summary line.
File.open('/tmp/ruby.log', 'w') do |log|
  pages.each { |page| log << "#{page.url}\n" }
  log << summary
end
metadata ADDED
@@ -0,0 +1,140 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: collamine
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Belson Heng
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-07-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: multipart-post
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: domainatrix
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: mongo
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: bson_ext
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Collamine is a ruby gem for CollaMine client, which communicates with
98
+ CollaMine servers to download content from their SmartCache if it exists.
99
+ email:
100
+ - belsonheng@gmail.com
101
+ executables: []
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - Gemfile
107
+ - LICENSE.txt
108
+ - README.md
109
+ - Rakefile
110
+ - collamine.gemspec
111
+ - lib/collamine.rb
112
+ - lib/collamine/request.rb
113
+ - lib/collamine/version.rb
114
+ - test.rb
115
+ homepage: http://github.com/belsonheng/collamine/
116
+ licenses:
117
+ - MIT
118
+ metadata: {}
119
+ post_install_message:
120
+ rdoc_options: []
121
+ require_paths:
122
+ - lib
123
+ required_ruby_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ required_rubygems_version: !ruby/object:Gem::Requirement
129
+ requirements:
130
+ - - ">="
131
+ - !ruby/object:Gem::Version
132
+ version: '0'
133
+ requirements: []
134
+ rubyforge_project:
135
+ rubygems_version: 2.4.5
136
+ signing_key:
137
+ specification_version: 4
138
+ summary: Collamine lets you crawl a web site and share the results with the community
139
+ via CollaMine servers.
140
+ test_files: []