apollo-crawler 0.1.30 → 0.1.31
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/apollo_crawler.rb +0 -3
- data/lib/apollo_crawler/lib.rb +0 -3
- data/lib/apollo_crawler/{adapter/amqp_adapter.rb → model/data_chunk.rb} +17 -4
- data/lib/apollo_crawler/model/models.rb +2 -0
- data/lib/apollo_crawler/{adapter/mongo_adapter.rb → model/user.rb} +20 -5
- data/lib/apollo_crawler/planner/smart_planner.rb +7 -0
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +3 -4
- data/lib/apollo_crawler/adapter/adapters.rb +0 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c51a69932406b288595a4063140629cf633bc893
|
4
|
+
data.tar.gz: bb1ad85084a5b0c3188301e2339ae1656c8807c5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6bf291b011220dc67e0f5bcaad0f9f3744e0cf607638372a441fb9f63752e6b65b82b384787ea3d506862c40a10ea3799e370e4b93e0282605aa6e2af602cdec
|
7
|
+
data.tar.gz: 48ea2fe6c144b771f72bddfa9012d227aa8f349fd04fea3378f2590cdd6ba30ef99ddf186dd199e450d94d6ee0b15dbb0cdff94cdc77c6fe849db3ef7c622f90
|
data/lib/apollo_crawler.rb
CHANGED
@@ -24,9 +24,6 @@
|
|
24
24
|
# Config First
|
25
25
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/env')
|
26
26
|
|
27
|
-
# Adapters
|
28
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/adapter/adapters')
|
29
|
-
|
30
27
|
# Agents
|
31
28
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/agent/agents')
|
32
29
|
|
data/lib/apollo_crawler/lib.rb
CHANGED
data/lib/apollo_crawler/{adapter/amqp_adapter.rb → model/data_chunk.rb}
CHANGED
@@ -18,9 +18,22 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
21
23
|
module Apollo
|
22
|
-
module
|
23
|
-
class
|
24
|
-
|
25
|
-
|
24
|
+
module Model
|
25
|
+
class DataChunk < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "data_chunks"
|
30
|
+
|
31
|
+
field :data
|
32
|
+
field :crawler
|
33
|
+
field :data_hash
|
34
|
+
|
35
|
+
# Indexes
|
36
|
+
index({ created_at: 1, updated_at: 1, crawler: 1, data_hash: 1})
|
37
|
+
end # class DataChunk
|
38
|
+
end # module Model
|
26
39
|
end # module Apollo
|
data/lib/apollo_crawler/model/models.rb
CHANGED
@@ -20,7 +20,9 @@
|
|
20
20
|
|
21
21
|
require File.join(File.dirname(__FILE__), 'base_model')
|
22
22
|
require File.join(File.dirname(__FILE__), 'crawler')
|
23
|
+
require File.join(File.dirname(__FILE__), 'data_chunk')
|
23
24
|
require File.join(File.dirname(__FILE__), 'data_source')
|
24
25
|
require File.join(File.dirname(__FILE__), 'domain')
|
25
26
|
require File.join(File.dirname(__FILE__), 'queued_url')
|
26
27
|
require File.join(File.dirname(__FILE__), 'raw_document')
|
28
|
+
require File.join(File.dirname(__FILE__), 'user')
|
data/lib/apollo_crawler/{adapter/mongo_adapter.rb → model/user.rb}
CHANGED
@@ -18,9 +18,24 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
21
23
|
module Apollo
|
22
|
-
module
|
23
|
-
class
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
module Model
|
25
|
+
class User < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "users"
|
30
|
+
|
31
|
+
field :nick
|
32
|
+
field :email
|
33
|
+
|
34
|
+
# Indexes
|
35
|
+
index({ created_at: 1, updated_at: 1})
|
36
|
+
index({ nick: 1 }, { unique: true, background: true })
|
37
|
+
index({ email: 1 }, { unique: true, background: true })
|
38
|
+
|
39
|
+
end # class User
|
40
|
+
end # module Model
|
41
|
+
end # module Apollo
|
data/lib/apollo_crawler/planner/smart_planner.rb
CHANGED
@@ -47,6 +47,10 @@ module Apollo
|
|
47
47
|
# Bindings
|
48
48
|
declarations[:queues]["planner.fetched.queue"].bind(declarations[:exchanges]["planner.fetched"]).subscribe do |delivery_info, metadata, payload|
|
49
49
|
msg = JSON.parse(payload)
|
50
|
+
puts "#{msg.inspect}" if opts[:verbose]
|
51
|
+
|
52
|
+
puts "REQ: #{msg['request']}" if opts[:verbose]
|
53
|
+
puts "RESP: #{msg['response']}" if opts[:verbose]
|
50
54
|
|
51
55
|
request = msg['request']
|
52
56
|
response = msg['response']
|
@@ -98,6 +102,9 @@ module Apollo
|
|
98
102
|
links = msg['links']
|
99
103
|
links = [] if links.nil?
|
100
104
|
|
105
|
+
data_hash = Digest::SHA256.new.update(data).hexdigest
|
106
|
+
puts "#{data_hash}"
|
107
|
+
|
101
108
|
links.each do |url|
|
102
109
|
link = url['link']
|
103
110
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.30
|
4
|
+
version: 0.1.31
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
@@ -410,9 +410,6 @@ files:
|
|
410
410
|
- ./config/mongoid.yml
|
411
411
|
- ./config/mongoid.yml.default
|
412
412
|
- ./lib/apollo_crawler.rb
|
413
|
-
- ./lib/apollo_crawler/adapter/adapters.rb
|
414
|
-
- ./lib/apollo_crawler/adapter/amqp_adapter.rb
|
415
|
-
- ./lib/apollo_crawler/adapter/mongo_adapter.rb
|
416
413
|
- ./lib/apollo_crawler/agent/agents.rb
|
417
414
|
- ./lib/apollo_crawler/agent/base_agent.rb
|
418
415
|
- ./lib/apollo_crawler/agent/crawler_agent.rb
|
@@ -457,11 +454,13 @@ files:
|
|
457
454
|
- ./lib/apollo_crawler/logger/loggers.rb
|
458
455
|
- ./lib/apollo_crawler/model/base_model.rb
|
459
456
|
- ./lib/apollo_crawler/model/crawler.rb
|
457
|
+
- ./lib/apollo_crawler/model/data_chunk.rb
|
460
458
|
- ./lib/apollo_crawler/model/data_source.rb
|
461
459
|
- ./lib/apollo_crawler/model/domain.rb
|
462
460
|
- ./lib/apollo_crawler/model/models.rb
|
463
461
|
- ./lib/apollo_crawler/model/queued_url.rb
|
464
462
|
- ./lib/apollo_crawler/model/raw_document.rb
|
463
|
+
- ./lib/apollo_crawler/model/user.rb
|
465
464
|
- ./lib/apollo_crawler/planner/base_planner.rb
|
466
465
|
- ./lib/apollo_crawler/planner/planners.rb
|
467
466
|
- ./lib/apollo_crawler/planner/smart_planner.rb
|
data/lib/apollo_crawler/adapter/adapters.rb
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
-
#
|
3
|
-
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
-
# of this software and associated documentation files (the "Software"), to deal
|
5
|
-
# in the Software without restriction, including without limitation the rights
|
6
|
-
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
-
# copies of the Software, and to permit persons to whom the Software is
|
8
|
-
# furnished to do so, subject to the following conditions:
|
9
|
-
#
|
10
|
-
# The above copyright notice and this permission notice shall be included in
|
11
|
-
# all copies or substantial portions of the Software.
|
12
|
-
#
|
13
|
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
-
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
-
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
-
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
-
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
-
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
-
# THE SOFTWARE.
|
20
|
-
|
21
|
-
require File.join(File.dirname(__FILE__), 'amqp_adapter')
|
22
|
-
require File.join(File.dirname(__FILE__), 'mongo_adapter')
|