apollo-crawler 0.1.26 → 0.1.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5e6fa7213e0e7f81364c5dbda6a8f53def1fda6e
4
- data.tar.gz: ecd79cd04f4a4331124b3910ef5e85da5f590692
3
+ metadata.gz: d6e559b9a330a556c3fb9eed416e0e83bdeb5d04
4
+ data.tar.gz: 289cf132fbd702c2c81d9517f95460aed5a629ee
5
5
  SHA512:
6
- metadata.gz: ae28d3adaca4125abdeee5ffbf73615a1efc6388e97882b67359846e81b6b58acca74fc08f59034c13f132892cca92b324876c20ab6680222aefc08b5e0d9ea1
7
- data.tar.gz: f452e71e138696125affbaa3ae646775857bf7b99a06a0656a85e08881e66fdbe357cd31eac4bc5f3030aadbf0075b4f602f280c669d65b139b14bc20ab974ec
6
+ metadata.gz: c150d48952a2061a2db796046016d0dc8cf0d3c858a41655995f3dea3b712cbc9470db75461a5598541313ea1de5a1325152579235a17f3c10911745eec7c532
7
+ data.tar.gz: 40bca94e6c247dcebd06fff4669dce6631a0c8096a3f26b4a3d68da7d03f1c1b9d0388fa067d46907291e64ec49d2ccd4f9c33fef3076c271816172f20d387bb
@@ -0,0 +1,187 @@
1
+ #! /usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+
24
+ require "rubygems"
25
+ require "bundler/setup"
26
+
27
+ require 'fog'
28
+
29
+ require 'fileutils'
30
+ require 'yaml'
31
+
32
module Apollo
  # Minimal command-line console for one AWS EC2 instance, driven through fog.
  #
  # Settings (credentials, region, default instance id) are kept in a YAML file
  # at AWS_CONFIG_FILE. The file is created interactively the first time a
  # command needs it.
  class AwsConsole
    AWS_CONFIG_FILE = File.expand_path("~/.apollo/config/apollo-aws.yml")

    # Frozen so that building a configuration can never alter the defaults.
    AWS_DEFAULT_CONFIG = {
      :provider => "aws",
      :aws_access_key_id => "",
      :aws_secret_access_key => "",
      :aws_default_instance => "i-a4039bee",
      :region => "eu-west-1"
    }.freeze

    attr_accessor :connection
    attr_accessor :config

    def initialize
      self.connection = nil
      self.config = nil
    end

    # Interactively builds a configuration and writes it to +path+.
    #
    # Each default is offered on STDIN; an empty answer (or end of input)
    # keeps the default value.
    #
    # Returns the resulting configuration Hash.
    def config_init(path)
      settings = AWS_DEFAULT_CONFIG.dup

      AWS_DEFAULT_CONFIG.each do |k, v|
        print "Override '#{k}' (default: '#{v}') ? > "
        # gets returns nil at end of input; to_s turns that into "keep default".
        val = STDIN.gets.to_s.chomp
        settings[k] = val unless val.empty?
      end

      self.config = settings
      config_save(path, settings)
    end

    # Writes +config+ as YAML to +path+, creating parent directories as needed.
    #
    # Returns +config+.
    def config_save(path, config)
      FileUtils.mkpath(File.dirname(path))

      # The file holds the AWS secret key, so a newly created file is made
      # readable by its owner only (the mode is ignored for existing files).
      File.open(path, 'w', 0600) do |f|
        f.write(config.to_yaml)
      end

      config
    end

    # Reads the YAML configuration stored at +path+ into #config.
    #
    # The saved file uses Symbol keys, which safe YAML loading rejects unless
    # Symbol is explicitly permitted.
    #
    # Returns the loaded configuration.
    def config_load(path)
      self.config = YAML.safe_load(File.read(path), permitted_classes: [Symbol])
    end

    # Opens the fog compute connection, reusing an existing one.
    #
    # Returns the connection, or -1 when it could not be established.
    def connect
      return connection if connection

      # Only these settings are handed to fog; the rest are for this console.
      provider_keys = [:provider, :aws_access_key_id, :aws_secret_access_key, :region]
      fog_options = config.select { |key, _value| provider_keys.include?(key) }

      begin
        self.connection = Fog::Compute.new(fog_options)
      rescue StandardError => e
        puts "Unable to connect to AWS, reason: #{e}"
        return -1
      end

      connection
    end

    # Looks up the server with the given id through the current connection.
    def get_instance(instance_id)
      puts "Getting instance '#{instance_id}'"
      connection.servers.get(instance_id)
    end

    # Executes a single console command.
    #
    # Returns 0 on success and -1 for an unknown command or a failed one.
    def run_cmd(cmd)
      # These two commands must not trigger the implicit load/init step below;
      # otherwise "init" on a fresh machine would ask every question twice.
      case cmd
      when "help"
        puts "Supported commands - init, info, interactive, list, start, stop"
        return 0
      when "init"
        config_init(AWS_CONFIG_FILE)
        return 0
      end

      load_or_init_config

      case cmd
      when "info"
        server = fetch_default_server("Inspecting")
        return -1 if server.nil?

        puts server.inspect
        0
      when "list"
        return -1 if connect == -1

        connection.servers.all.table([:id, :flavor_id, :public_ip_address, :private_ip_address, :image_id])
        0
      when "start"
        change_default_server_state("Starting", :start)
      when "stop"
        change_default_server_state("Stopping", :stop)
      else
        puts "Unknown command '#{cmd}'"
        -1
      end
    end

    # Entry point: runs the command given in ARGV[0] ("help" when absent), or
    # a read-eval loop when the command is "interactive".
    #
    # The loop ends on "quit" or at end of input.
    #
    # Returns the command's status, or 0 after an interactive session.
    def run
      cmd = ARGV.empty? ? "help" : ARGV[0]
      return run_cmd(cmd) unless cmd == "interactive"

      print "> "
      while (line = STDIN.gets)
        cmd = line.chomp
        break if cmd.downcase == "quit"

        run_cmd(cmd)
        print "> "
      end

      0
    end

    private

    # Makes sure #config is populated: loads the saved file, or asks the user
    # when no file exists yet.
    def load_or_init_config
      return unless config.nil?

      if File.exist?(AWS_CONFIG_FILE)
        config_load(AWS_CONFIG_FILE)
      else
        config_init(AWS_CONFIG_FILE)
      end
    end

    # Announces +action+, connects and fetches the configured default instance.
    #
    # Returns the server, or nil when connecting failed or nothing was found.
    def fetch_default_server(action)
      instance = config[:aws_default_instance]
      puts "#{action} instance '#{instance}'"
      return nil if connect == -1

      server = get_instance(instance)
      puts "Instance '#{instance}' not found" if server.nil?
      server
    end

    # Calls +operation+ (:start or :stop) on the default instance and reports
    # the outcome.
    #
    # Returns 0, or -1 when the instance could not be fetched.
    def change_default_server_state(action, operation)
      server = fetch_default_server(action)
      return -1 if server.nil?

      res = server.public_send(operation)
      puts " => Success: #{res}"
      puts " => DNS: #{server.dns_name}"
      0
    end
  end
end
184
+
185
+ if __FILE__ == $0
186
+ Apollo::AwsConsole::new.run()
187
+ end
@@ -1,26 +1,23 @@
1
- default: &default_options
1
+ default:
2
2
  sessions:
3
3
  default:
4
4
  hosts:
5
5
  - apollo-crawler.no-ip.org:27017
6
-
7
6
  development:
8
7
  sessions:
9
8
  default:
10
9
  hosts:
11
10
  - localhost:27017
12
11
  database: apollo-crawler-development
13
-
14
12
  test:
15
13
  sessions:
16
14
  default:
17
- hosts:
18
- - apollo-crawler.no-ip.org:27017
19
- database: apollo-crawler-test
20
-
15
+ hosts:
16
+ - apollo-crawler.no-ip.org:27017
17
+ database: apollo-crawler-test
21
18
  production:
22
19
  sessions:
23
20
  default:
24
- hosts:
25
- - apollo-crawler.no-ip.org:27017
26
- database: apollo-crawler-production
21
+ hosts:
22
+ - apollo-crawler.no-ip.org:27017
23
+ database: apollo-crawler-production
@@ -37,18 +37,20 @@ module Apollo
37
37
 
38
38
  # Declarations
39
39
  channel = amqp.create_channel
40
- declarations = Apollo::Agent.declare_exchanges(channel, opts)
41
-
40
+ self.declarations = Apollo::Agent.declare_entities(channel, opts)# Binding
41
+
42
42
  # Binding
43
- declarations[:queues]["crawler.queue"].bind(declarations[:exchanges]["crawler"]).subscribe do |delivery_info, metadata, payload|
44
- puts "CrawlerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
43
+ queue = self.declarations[:queues]["crawler.queue"]
44
+ exchange = self.declarations[:exchanges]["crawler"]
45
45
 
46
+ queue.bind(exchange).subscribe do |delivery_info, metadata, payload|
46
47
  msg = JSON.parse(payload)
47
48
 
48
49
  request = msg['request']
49
50
  response = msg['response']
51
+ url = request["url"]
50
52
 
51
- # puts "PLANEEEER: #{msg.inspect}"
53
+ puts "CrawlerAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
52
54
 
53
55
  doc = Nokogiri::HTML(response['body'])
54
56
  crawler = request['crawler_name'].constantize.new
@@ -59,7 +61,7 @@ module Apollo
59
61
  # puts res.inspect
60
62
 
61
63
  if(metadata[:reply_to] != nil)
62
- x = declarations[:exchanges][metadata[:reply_to]]
64
+ x = self.declarations[:exchanges][metadata[:reply_to]]
63
65
 
64
66
  msg = {
65
67
  :request => request,
@@ -37,7 +37,7 @@ module Apollo
37
37
 
38
38
  # Declarations
39
39
  channel = amqp.create_channel
40
- declarations = Apollo::Agent.declare_exchanges(channel, opts)
40
+ declarations = Apollo::Agent.declare_entities(channel, opts)
41
41
 
42
42
  # Binding
43
43
  declarations[:queues]["domainer.queue"].bind(declarations[:exchanges]["domainer"]).subscribe do |delivery_info, metadata, payload|
@@ -20,6 +20,23 @@
20
20
 
21
21
  module Apollo
22
22
  module Agent
23
# Declares the durable work queues used by the agents on the given channel.
#
# channel - AMQP channel; must respond to #queue(name, options).
# opts    - options Hash; a truthy :verbose prints a progress line.
#
# Returns a Hash mapping each queue name to whatever channel.queue returned
# for it.
def self.declare_queues(channel, opts={})
  puts "Declaring AMQP Queues" if opts[:verbose]

  # One list of names, so the Hash key and the declared queue name cannot
  # drift apart.
  queue_names = %w[
    crawler.queue
    domainer.queue
    fetcher.queue
    planner.crawled.queue
    planner.domained.queue
    planner.fetched.queue
  ]

  queue_names.each_with_object({}) do |name, queues|
    queues[name] = channel.queue(name, :auto_delete => false, :durable => true)
  end
end
39
+
23
40
  def self.declare_exchanges(channel, opts={})
24
41
  if(opts[:verbose])
25
42
  puts "Declaring AMQP Exchanges"
@@ -34,14 +51,12 @@ module Apollo
34
51
  exchanges["planner.domained"] = channel.direct("planner.domained", :auto_delete => false, :durable => true)
35
52
  exchanges["planner.fetched"] = channel.direct("planner.fetched", :auto_delete => false, :durable => true)
36
53
 
37
- # Queues
38
- queues = {}
39
- queues["crawler.queue"] = channel.queue("crawler.queue", :auto_delete => false, :durable => true)
40
- queues["domainer.queue"] = channel.queue("domainer.queue", :auto_delete => false, :durable => true)
41
- queues["fetcher.queue"] = channel.queue("fetcher.queue", :auto_delete => false, :durable => true)
42
- queues["planner.crawled.queue"] = channel.queue("planner.crawled.queue", :auto_delete => false, :durable => true)
43
- queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
44
- queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
54
+ return exchanges
55
+ end
56
+
57
+ def self.declare_entities(channel, opts={})
58
+ exchanges = self.declare_exchanges(channel, opts)
59
+ queues = self.declare_queues(channel, opts)
45
60
 
46
61
  # Compose res
47
62
  res = {
@@ -23,17 +23,20 @@ require File.join(File.dirname(__FILE__), 'base_agent')
23
23
 
24
24
  require File.join(File.dirname(__FILE__), '../fetcher/fetchers')
25
25
 
26
+ require 'amqp'
27
+ require 'amqp/extensions/rabbitmq'
28
+
26
29
  require 'digest/sha1'
27
- require 'thread/pool'
30
+ require 'thread'
28
31
 
29
32
  module Apollo
30
33
  module Agent
31
34
  class FetcherAgent < BaseAgent
32
- THREAD_POOL_SIZE = 10
35
+ THREAD_POOL_SIZE = 1
33
36
 
34
37
  attr_accessor :fetcher
35
38
  attr_accessor :declarations
36
- attr_accessor :thread_pool
39
+ attr_accessor :mutex
37
40
 
38
41
  def initialize(amqp, opts={})
39
42
  self.fetcher = Apollo::Fetcher::SmartFetcher.new
@@ -42,46 +45,54 @@ module Apollo
42
45
  puts "Initializing fetcher agent..."
43
46
  end
44
47
 
45
- thread_pool = Thread::Pool.new(THREAD_POOL_SIZE)
46
-
47
48
  # Declarations
48
49
  channel = amqp.create_channel
49
- declarations = Apollo::Agent.declare_exchanges(channel, opts)
50
+ channel.prefetch(THREAD_POOL_SIZE)
51
+
52
+ # Binding (Default)
53
+ self.declarations = Apollo::Agent.declare_entities(channel, opts)
54
+ queue = declarations[:queues]["fetcher.queue"]
55
+
56
+ # AMQP contexts for threads
57
+ contexts = []
58
+ (0...THREAD_POOL_SIZE).each do |i|
59
+ puts "FetcherAgent::initialize() - Creating context #{i}" if opts[:verbose]
60
+ end
50
61
 
51
- # Binding
52
- declarations[:queues]["fetcher.queue"].bind(declarations[:exchanges]["fetcher"]).subscribe do |delivery_info, metadata, payload|
53
- thread_pool.process {
54
- queued_url = JSON.parse(payload)
55
- url = queued_url["url"]
62
+ # AMQP contexts mutex/lock
63
+ self.mutex = Mutex.new()
56
64
 
57
- puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
65
+ exchange = self.declarations[:exchanges]["fetcher"]
58
66
 
59
- doc = nil
60
- begin
61
- doc = Apollo::Fetcher::SmartFetcher::fetch(url)
62
- rescue Exception => e
63
- puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
64
- end
67
+ queue.bind(exchange).subscribe(:ack => true) do |delivery_info, metadata, payload|
68
+ # Concurrency can cause trouble here; see https://groups.google.com/forum/?fromgroups=#!topic/ruby-amqp/aO9GPu-jxuE
69
+ queued_url = JSON.parse(payload)
70
+ url = queued_url["url"]
65
71
 
66
- doc = get_fetched_doc(queued_url, doc, metadata, opts)
67
-
68
- # send_response_msg(metadata[:reply_to], get_response_msg(queued_url, doc))
72
+ puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
73
+ self.mutex.synchronize {
74
+ puts "FetcherAgent: Acking - '#{delivery_info.delivery_tag}'" if opts[:verbose]
75
+ channel.basic_ack(delivery_info.delivery_tag, true)
76
+ }
69
77
 
78
+ begin
79
+ doc = Apollo::Fetcher::SmartFetcher::fetch(url)
80
+ doc = process_fetched_doc(queued_url, doc, metadata, opts)
81
+
70
82
  if(metadata && metadata[:reply_to])
71
- exchange_name = metadata[:reply_to]
83
+ puts "Replying to '#{metadata[:reply_to]}'"
84
+ send_response_msg(metadata[:reply_to], queued_url, doc)
85
+ end
72
86
 
73
- if(exchange_name != nil)
74
- msg = get_response_msg(queued_url, doc)
87
+ rescue Exception => e
88
+ puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
89
+ end
75
90
 
76
- x = declarations[:exchanges][exchange_name]
77
- x.publish(msg.to_json)
78
- end
79
- end
80
- }
91
+ doc
81
92
  end
82
93
  end
83
94
 
84
- def get_fetched_doc(queued_url, doc, metadata, opts={})
95
+ def process_fetched_doc(queued_url, doc, metadata, opts={})
85
96
  url = queued_url["url"]
86
97
 
87
98
  res = Apollo::Model::RawDocument.new
@@ -94,12 +105,23 @@ module Apollo
94
105
  return res
95
106
  end
96
107
 
97
- def get_response_msg(queued_url, doc)
108
+ def format_response_msg(queued_url, doc)
98
109
  return {
99
110
  :request => queued_url,
100
111
  :response => doc
101
112
  }
102
113
  end
114
+
115
+ def send_response_msg(dest, queued_url, doc)
116
+ if(dest != nil)
117
+ msg = format_response_msg(queued_url, doc)
118
+
119
+ self.mutex.synchronize {
120
+ exchange = self.declarations[:exchanges][dest]
121
+ exchange.publish(msg.to_json)
122
+ }
123
+ end
124
+ end
103
125
  end # class FetcherAgent
104
126
  end # module Agent
105
127
  end # module Apollo
@@ -22,6 +22,7 @@ require File.join(File.dirname(__FILE__), 'base_crawler')
22
22
  require File.join(File.dirname(__FILE__), 'google_crawler')
23
23
  require File.join(File.dirname(__FILE__), 'hacker_news_crawler')
24
24
  require File.join(File.dirname(__FILE__), 'slashdot_crawler')
25
+ require File.join(File.dirname(__FILE__), 'spider_crawler')
25
26
  require File.join(File.dirname(__FILE__), 'stackoverflow_crawler')
26
27
  require File.join(File.dirname(__FILE__), 'xkcd_crawler')
27
28
  require File.join(File.dirname(__FILE__), 'youjizz_crawler')
@@ -0,0 +1,52 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'base_crawler')
22
+
23
+ module Apollo
24
+ module Crawler
25
+ class SpiderCrawler < BaseCrawler
26
+ def name()
27
+ return "Spider"
28
+ end
29
+
30
+ def url()
31
+ return "http://www.wikipedia.org/"
32
+ end
33
+
34
+ def extract_data(doc)
35
+ []
36
+ end
37
+
38
# Collects the target of every anchor element in the document.
#
# doc - parsed document responding to #xpath; each returned node must support
#       node['href'].
#
# Each href is resolved against this crawler's #url through
# BaseCrawler.try_get_url. Anchors whose href cannot be resolved are left out.
#
# Returns an Array of unique { :link => String } Hashes.
def extract_links(doc)
  base = url

  links = doc.xpath("//a").map do |node|
    resolved = BaseCrawler.try_get_url(base, node['href'])
    # NOTE(review): assumes try_get_url returns nil for an unusable href - confirm.
    # The nil check has to come before to_s, which would turn nil into "".
    resolved.nil? ? nil : { :link => resolved.to_s }
  end

  links.compact.uniq
end
50
+ end # class SpiderCrawler
51
+ end # module Crawler
52
+ end # module Apollo
@@ -19,6 +19,7 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  require 'amqp'
22
+ require 'amqp/extensions/rabbitmq'
22
23
  require 'bunny'
23
24
  require 'thread'
24
25
 
@@ -21,6 +21,8 @@
21
21
  require 'mongo'
22
22
  require 'mongoid'
23
23
 
24
+ require 'csv'
25
+
24
26
  module Apollo
25
27
  module Helper
26
28
  module Mongo
@@ -37,6 +39,36 @@ module Apollo
37
39
 
38
40
  return res
39
41
  end
42
+
43
# Streams a CSV file into a collection in batches.
#
# path      - path of the CSV file to read.
# model     - must respond to #collection, whose #insert receives an Array of
#             documents; when validate is true it must also respond to #where.
# bulk_size - number of documents to accumulate before each insert.
# validate  - when true, a document is skipped if model.where(document)
#             already returns something.
# block     - called with every CSV row; returns the document to insert, or
#             nil to skip the row. Without a block nothing is inserted.
def self.csv_bulk_insert(path, model, bulk_size, validate=false, &block)
  batch = []

  CSV.foreach(path) do |row|
    doc = block_given? ? yield(row) : nil
    next if doc.nil?
    next if validate && model.where(doc).length != 0

    batch << doc

    # Flush only once a full batch has accumulated. A modulo test would also
    # be true for an empty batch and send an empty insert for every skipped row.
    next if batch.length < bulk_size

    # puts "Inserting batch '#{batch.inspect}'"
    model.collection.insert(batch)
    batch.clear
  end

  # Whatever is left over after the last full batch.
  unless batch.empty?
    model.collection.insert(batch)
    batch.clear
  end
end
71
+
40
72
  end # Mongo
41
73
  end # module Helper
42
74
  end # module Apollo
@@ -29,9 +29,10 @@ module Apollo
29
29
  store_in collection: "domains"
30
30
 
31
31
  field :name
32
+ field :rank
32
33
 
33
34
  # Indexes
34
- index({ created_at: 1, updated_at: 1, name: 1 })
35
+ index({ created_at: 1, updated_at: 1, name: 1, rank: 1})
35
36
  end # class Domain
36
37
  end # module Model
37
38
  end # module Apollo
@@ -41,7 +41,8 @@ module Apollo
41
41
 
42
42
  # Declarations
43
43
  channel = amqp.create_channel
44
- self.declarations = Apollo::Agent.declare_exchanges(channel, opts)
44
+ # channel.prefetch(1)
45
+ self.declarations = Apollo::Agent.declare_entities(channel, opts)
45
46
 
46
47
  # Bindings
47
48
  declarations[:queues]["planner.fetched.queue"].bind(declarations[:exchanges]["planner.fetched"]).subscribe do |delivery_info, metadata, payload|
@@ -67,14 +68,17 @@ module Apollo
67
68
  end
68
69
  else
69
70
  doc = Apollo::Model::RawDocument.where(:sha_hash => response['sha_hash']).first
71
+ if(doc.nil? == false)
72
+ puts "Same as #{doc.inspect}"
73
+ end
70
74
  end
71
75
 
72
76
  if(doc.nil?)
73
77
  doc = Apollo::Model::RawDocument.new(response).save
74
- end
75
78
 
76
- # Publish
77
- declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
79
+ # Publish
80
+ declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
81
+ end
78
82
  end
79
83
 
80
84
  declarations[:queues]["planner.domained.queue"].bind(declarations[:exchanges]["planner.domained"]).subscribe do |delivery_info, metadata, payload|
@@ -103,6 +107,10 @@ module Apollo
103
107
  end
104
108
  end
105
109
 
110
+ def get_url_count(state, opts={})
111
+ Apollo::Model::QueuedUrl.where({:state => state}).count
112
+ end
113
+
106
114
  def fetch_url(url, opts={})
107
115
  if(opts[:verbose])
108
116
  puts "AMQP fetching '#{url.inspect}'"
@@ -113,19 +121,22 @@ module Apollo
113
121
  end
114
122
 
115
123
  def get_next_url(opts={})
116
- Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
124
+ tmp = Apollo::Model::QueuedUrl.where({:state => :queued}).order_by(:created_at.asc)
125
+ tmp.find_and_modify({ "$set" => { state: :fetching }}, new: true)
117
126
  end
118
127
 
119
128
  def fetch_queued_urls(opts={})
120
- url = get_next_url(opts)
129
+ fetching_count = Apollo::Model::QueuedUrl.where({:state => :fetching}).count
121
130
 
122
- while url
123
- puts url.inspect
124
- # puts "Count of URLs in Queue: #{url.count}" if opts[:verbose]
131
+ if(fetching_count > 4)
132
+ puts "Fetching too many URLs. Slowing down for a while ..."
133
+ return
134
+ end
125
135
 
136
+ while get_url_count(:fetching) < 4
137
+ url = get_next_url(opts)
138
+ puts "SmartPlanner::fetch_queued_urls() - Queueing: #{url.inspect}"
126
139
  fetch_url(url, opts)
127
-
128
- url = get_next_url()
129
140
  end
130
141
  end
131
142
 
@@ -138,14 +138,16 @@ module Apollo
138
138
  end
139
139
 
140
140
  Thread::new {
141
- CSV.foreach(path) do |row|
141
+ Apollo::Helper::Mongo::csv_bulk_insert(path, Apollo::Model::Domain, 1000, false) do |row|
142
+ rank = row[0].to_i
142
143
  name = row[1]
143
- domain = Apollo::Model::Domain.where({:name => name}).first()
144
- if(domain.nil?)
145
- domain = Apollo::Model::Domain.new({:name => name})
146
- domain.save
147
- print "."
148
- end
144
+
145
+ res = {
146
+ :rank => rank,
147
+ :name => name
148
+ }
149
+
150
+ res
149
151
  end
150
152
  }
151
153
  end
@@ -18,7 +18,7 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
- require File.join(File.dirname(__FILE__), "../model/models")
21
+ require File.join(File.dirname(__FILE__), "../model/models.rb")
22
22
 
23
23
  module Apollo
24
24
  module Scheduler
@@ -26,13 +26,11 @@ module Apollo
26
26
  def self.schedule(url, crawler=nil, opts={})
27
27
  queued_url = Apollo::Model::QueuedUrl.where(:url => url).first
28
28
 
29
- if(queued_url.nil?)
30
- qu = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
31
- qu.save
32
- return qu
33
- end
34
-
35
- return nil
29
+ return queued_url if queued_url
30
+
31
+ res = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
32
+ res.save
33
+ return res
36
34
  end
37
35
  end # class BaseScheduler
38
36
  end # module Scheduler
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.26'
22
+ VERSION = '0.1.27'
23
23
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.26
4
+ version: 0.1.27
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak
@@ -391,6 +391,7 @@ dependencies:
391
391
  description: Gem for crawling data from external sources
392
392
  email: korczis@gmail.com
393
393
  executables:
394
+ - apollo-aws
394
395
  - apollo-console
395
396
  - apollo-crawler
396
397
  - apollo-platform
@@ -432,6 +433,7 @@ files:
432
433
  - ./lib/apollo_crawler/crawler/google_crawler.rb
433
434
  - ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
434
435
  - ./lib/apollo_crawler/crawler/slashdot_crawler.rb
436
+ - ./lib/apollo_crawler/crawler/spider_crawler.rb
435
437
  - ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
436
438
  - ./lib/apollo_crawler/crawler/xkcd_crawler.rb
437
439
  - ./lib/apollo_crawler/crawler/youjizz_crawler.rb
@@ -473,6 +475,7 @@ files:
473
475
  - ./lib/apollo_crawler/store/base_store.rb
474
476
  - ./lib/apollo_crawler/store/stores.rb
475
477
  - ./lib/apollo_crawler/version.rb
478
+ - bin/apollo-aws
476
479
  - bin/apollo-console
477
480
  - bin/apollo-crawler
478
481
  - bin/apollo-platform