apollo-crawler 0.1.26 → 0.1.27

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5e6fa7213e0e7f81364c5dbda6a8f53def1fda6e
4
- data.tar.gz: ecd79cd04f4a4331124b3910ef5e85da5f590692
3
+ metadata.gz: d6e559b9a330a556c3fb9eed416e0e83bdeb5d04
4
+ data.tar.gz: 289cf132fbd702c2c81d9517f95460aed5a629ee
5
5
  SHA512:
6
- metadata.gz: ae28d3adaca4125abdeee5ffbf73615a1efc6388e97882b67359846e81b6b58acca74fc08f59034c13f132892cca92b324876c20ab6680222aefc08b5e0d9ea1
7
- data.tar.gz: f452e71e138696125affbaa3ae646775857bf7b99a06a0656a85e08881e66fdbe357cd31eac4bc5f3030aadbf0075b4f602f280c669d65b139b14bc20ab974ec
6
+ metadata.gz: c150d48952a2061a2db796046016d0dc8cf0d3c858a41655995f3dea3b712cbc9470db75461a5598541313ea1de5a1325152579235a17f3c10911745eec7c532
7
+ data.tar.gz: 40bca94e6c247dcebd06fff4669dce6631a0c8096a3f26b4a3d68da7d03f1c1b9d0388fa067d46907291e64ec49d2ccd4f9c33fef3076c271816172f20d387bb
@@ -0,0 +1,187 @@
1
+ #! /usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+
24
+ require "rubygems"
25
+ require "bundler/setup"
26
+
27
+ require 'fog'
28
+
29
+ require 'fileutils'
30
+ require 'yaml'
31
+
32
module Apollo
  # Minimal command-line console for working with AWS through Fog:
  # interactive configuration, listing servers, and inspecting, starting or
  # stopping the instance named by the :aws_default_instance setting.
  class AwsConsole
    # Location of the persisted console configuration.
    AWS_CONFIG_FILE = File.expand_path("~/.apollo/config/apollo-aws.yml")

    # Values offered as defaults by #config_init. Never modified in place;
    # working copies are taken with +dup+.
    AWS_DEFAULT_CONFIG = {
      :provider => "aws",
      :aws_access_key_id => "",
      :aws_secret_access_key => "",
      :aws_default_instance => "i-a4039bee",
      :region => "eu-west-1"
    }

    # Configuration keys that are forwarded to Fog::Compute.new.
    AWS_PROVIDER_KEYS = [:provider, :aws_access_key_id, :aws_secret_access_key, :region]

    attr_accessor :connection
    attr_accessor :config

    def initialize
      self.connection = nil
      self.config = nil
    end

    # Interactively asks for an override of every default setting, stores the
    # result in #config and writes it to +path+.
    #
    # An empty answer (or end of input) keeps the default value.
    #
    # @param path [String] file the configuration is saved to
    # @return [Hash] the resulting configuration
    def config_init(path)
      # Work on a copy so the defaults stay pristine for later calls.
      settings = AWS_DEFAULT_CONFIG.dup

      AWS_DEFAULT_CONFIG.each do |key, default|
        print "Override '#{key}' (default: '#{default}') ? > "
        answer = read_line
        settings[key] = answer unless answer.nil? || answer.empty?
      end

      self.config = settings
      config_save(path, settings)
    end

    # Writes +config+ as YAML to +path+, creating parent directories on demand.
    #
    # @param path [String] destination file
    # @param config [Hash] configuration to persist
    # @return [Hash] the +config+ argument
    def config_save(path, config)
      FileUtils.mkdir_p(File.dirname(path))

      # The file holds AWS credentials, so create it readable by the owner only.
      File.open(path, 'w', 0600) do |f|
        f.write(config.to_yaml)
      end

      return config
    end

    # Loads the configuration stored at +path+ into #config.
    #
    # A file that does not contain a YAML mapping (for example an empty file)
    # yields a copy of AWS_DEFAULT_CONFIG instead of a non-hash value.
    #
    # @param path [String] YAML file to read
    # @return [Hash] the loaded configuration
    def config_load(path)
      loaded = YAML.load_file(path)
      self.config = loaded.is_a?(Hash) ? loaded : AWS_DEFAULT_CONFIG.dup
    end

    # Lazily opens the Fog compute connection using the provider-related
    # subset of #config.
    #
    # @return [Object, Integer] the connection, or -1 when connecting failed
    #   (in which case #connection stays nil)
    def connect()
      return self.connection if self.connection

      settings = self.config.select { |key, _value| AWS_PROVIDER_KEYS.include?(key) }

      begin
        self.connection = Fog::Compute.new(settings)
      rescue StandardError, LoadError => e
        # Interrupts and exit requests are deliberately not swallowed here.
        puts "Unable to connect to AWS, reason: #{e.to_s}"
        return -1
      end

      return self.connection
    end

    # Looks up a server by its instance id on the current connection.
    #
    # @param instance_id [String]
    # @return [Object, nil] whatever +servers.get+ returns for that id
    def get_instance(instance_id)
      puts "Getting instance '#{instance_id}'"
      self.connection.servers.get(instance_id)
    end

    # Executes a single console command.
    #
    # @param cmd [String] one of help, init, info, list, start, stop
    # @return [Integer] 0 on success, -1 on failure or unknown command
    def run_cmd(cmd)
      case cmd
      when "help"
        puts "Supported commands - init, info, interactive, list, start, stop"
        return 0
      when "init"
        # Handled before the automatic bootstrap below so that a first run of
        # "init" does not ask every question twice.
        config_init(AWS_CONFIG_FILE)
        return 0
      end

      ensure_config

      case cmd
      when "info"
        return with_default_instance("Inspecting") { |server| puts server.inspect }

      when "list"
        return -1 unless connected?
        instance_list = connection.servers.all
        instance_list.table([:id, :flavor_id, :public_ip_address, :private_ip_address, :image_id ])
        return 0

      when "start"
        return change_instance_state("Starting") { |server| server.start }

      when "stop"
        return change_instance_state("Stopping") { |server| server.stop }
      end

      puts "Unknown command '#{cmd}'"
      return -1
    end

    # Entry point: runs the command given as first program argument ("help"
    # when absent). The "interactive" command starts a prompt that executes
    # one command per line until "quit" or end of input.
    #
    # @return [Integer] exit status (always 0 for interactive sessions)
    def run()
      cmd = ARGV.length > 0 ? ARGV[0] : "help"
      return run_cmd(cmd) unless cmd == "interactive"

      print "> "
      while (line = read_line)
        break if line.downcase == "quit"

        run_cmd(line)

        print "> "
      end

      return 0
    end

    private

    # Reads one line from standard input without its line terminator.
    # Returns nil at end of input.
    def read_line
      line = $stdin.gets
      line.nil? ? nil : line.chomp
    end

    # Makes sure #config is populated: loads the saved file when present,
    # otherwise runs the interactive initialisation.
    def ensure_config
      return unless self.config.nil?

      if File.exist?(AWS_CONFIG_FILE)
        config_load(AWS_CONFIG_FILE)
      else
        config_init(AWS_CONFIG_FILE)
      end
    end

    # Tries to connect and reports whether a usable connection exists.
    def connected?
      connect()
      !self.connection.nil?
    end

    # Announces +action+, resolves the configured default instance and yields
    # it. Returns 0 after the block ran, -1 when the connection or the
    # instance is unavailable.
    def with_default_instance(action)
      instance = self.config[:aws_default_instance]
      puts "#{action} instance '#{instance}'"
      return -1 unless connected?

      server = get_instance(instance)
      if server.nil?
        puts "Instance '#{instance}' not found"
        return -1
      end

      yield server
      0
    end

    # Shared implementation of "start" and "stop": yields the server to the
    # block performing the state change and prints the outcome.
    def change_instance_state(action)
      with_default_instance(action) do |server|
        res = yield server
        puts " => Success: #{res}"
        puts " => DNS: #{server.dns_name}"
      end
    end
  end
end
184
+
185
# Run the console only when this file is executed directly, not when required.
Apollo::AwsConsole.new.run if __FILE__ == $PROGRAM_NAME
@@ -1,26 +1,23 @@
1
- default: &default_options
1
+ default:
2
2
  sessions:
3
3
  default:
4
4
  hosts:
5
5
  - apollo-crawler.no-ip.org:27017
6
-
7
6
  development:
8
7
  sessions:
9
8
  default:
10
9
  hosts:
11
10
  - localhost:27017
12
11
  database: apollo-crawler-development
13
-
14
12
  test:
15
13
  sessions:
16
14
  default:
17
- hosts:
18
- - apollo-crawler.no-ip.org:27017
19
- database: apollo-crawler-test
20
-
15
+ hosts:
16
+ - apollo-crawler.no-ip.org:27017
17
+ database: apollo-crawler-test
21
18
  production:
22
19
  sessions:
23
20
  default:
24
- hosts:
25
- - apollo-crawler.no-ip.org:27017
26
- database: apollo-crawler-production
21
+ hosts:
22
+ - apollo-crawler.no-ip.org:27017
23
+ database: apollo-crawler-production
@@ -37,18 +37,20 @@ module Apollo
37
37
 
38
38
  # Declarations
39
39
  channel = amqp.create_channel
40
- declarations = Apollo::Agent.declare_exchanges(channel, opts)
41
-
40
+ self.declarations = Apollo::Agent.declare_entities(channel, opts)# Binding
41
+
42
42
  # Binding
43
- declarations[:queues]["crawler.queue"].bind(declarations[:exchanges]["crawler"]).subscribe do |delivery_info, metadata, payload|
44
- puts "CrawlerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
43
+ queue = self.declarations[:queues]["crawler.queue"]
44
+ exchange = self.declarations[:exchanges]["crawler"]
45
45
 
46
+ queue.bind(exchange).subscribe do |delivery_info, metadata, payload|
46
47
  msg = JSON.parse(payload)
47
48
 
48
49
  request = msg['request']
49
50
  response = msg['response']
51
+ url = request["url"]
50
52
 
51
- # puts "PLANEEEER: #{msg.inspect}"
53
+ puts "CrawlerAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
52
54
 
53
55
  doc = Nokogiri::HTML(response['body'])
54
56
  crawler = request['crawler_name'].constantize.new
@@ -59,7 +61,7 @@ module Apollo
59
61
  # puts res.inspect
60
62
 
61
63
  if(metadata[:reply_to] != nil)
62
- x = declarations[:exchanges][metadata[:reply_to]]
64
+ x = self.declarations[:exchanges][metadata[:reply_to]]
63
65
 
64
66
  msg = {
65
67
  :request => request,
@@ -37,7 +37,7 @@ module Apollo
37
37
 
38
38
  # Declarations
39
39
  channel = amqp.create_channel
40
- declarations = Apollo::Agent.declare_exchanges(channel, opts)
40
+ declarations = Apollo::Agent.declare_entities(channel, opts)
41
41
 
42
42
  # Binding
43
43
  declarations[:queues]["domainer.queue"].bind(declarations[:exchanges]["domainer"]).subscribe do |delivery_info, metadata, payload|
@@ -20,6 +20,23 @@
20
20
 
21
21
  module Apollo
22
22
  module Agent
23
# Declares every AMQP queue used by the agents on the given channel.
#
# Each queue is declared durable and without auto-delete.
#
# @param channel [#queue] AMQP channel used for the declarations
# @param opts [Hash] pass :verbose => true to log the declaration
# @return [Hash] declared queues keyed by queue name
def self.declare_queues(channel, opts={})
  puts "Declaring AMQP Queues" if opts[:verbose]

  queue_names = %w[
    crawler.queue
    domainer.queue
    fetcher.queue
    planner.crawled.queue
    planner.domained.queue
    planner.fetched.queue
  ]

  queue_names.each_with_object({}) do |queue_name, declared|
    declared[queue_name] = channel.queue(queue_name, :auto_delete => false, :durable => true)
  end
end
39
+
23
40
  def self.declare_exchanges(channel, opts={})
24
41
  if(opts[:verbose])
25
42
  puts "Declaring AMQP Exchanges"
@@ -34,14 +51,12 @@ module Apollo
34
51
  exchanges["planner.domained"] = channel.direct("planner.domained", :auto_delete => false, :durable => true)
35
52
  exchanges["planner.fetched"] = channel.direct("planner.fetched", :auto_delete => false, :durable => true)
36
53
 
37
- # Queues
38
- queues = {}
39
- queues["crawler.queue"] = channel.queue("crawler.queue", :auto_delete => false, :durable => true)
40
- queues["domainer.queue"] = channel.queue("domainer.queue", :auto_delete => false, :durable => true)
41
- queues["fetcher.queue"] = channel.queue("fetcher.queue", :auto_delete => false, :durable => true)
42
- queues["planner.crawled.queue"] = channel.queue("planner.crawled.queue", :auto_delete => false, :durable => true)
43
- queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
44
- queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
54
+ return exchanges
55
+ end
56
+
57
+ def self.declare_entities(channel, opts={})
58
+ exchanges = self.declare_exchanges(channel, opts)
59
+ queues = self.declare_queues(channel, opts)
45
60
 
46
61
  # Compose res
47
62
  res = {
@@ -23,17 +23,20 @@ require File.join(File.dirname(__FILE__), 'base_agent')
23
23
 
24
24
  require File.join(File.dirname(__FILE__), '../fetcher/fetchers')
25
25
 
26
+ require 'amqp'
27
+ require 'amqp/extensions/rabbitmq'
28
+
26
29
  require 'digest/sha1'
27
- require 'thread/pool'
30
+ require 'thread'
28
31
 
29
32
  module Apollo
30
33
  module Agent
31
34
  class FetcherAgent < BaseAgent
32
- THREAD_POOL_SIZE = 10
35
+ THREAD_POOL_SIZE = 1
33
36
 
34
37
  attr_accessor :fetcher
35
38
  attr_accessor :declarations
36
- attr_accessor :thread_pool
39
+ attr_accessor :mutex
37
40
 
38
41
  def initialize(amqp, opts={})
39
42
  self.fetcher = Apollo::Fetcher::SmartFetcher.new
@@ -42,46 +45,54 @@ module Apollo
42
45
  puts "Initializing fetcher agent..."
43
46
  end
44
47
 
45
- thread_pool = Thread::Pool.new(THREAD_POOL_SIZE)
46
-
47
48
  # Declarations
48
49
  channel = amqp.create_channel
49
- declarations = Apollo::Agent.declare_exchanges(channel, opts)
50
+ channel.prefetch(THREAD_POOL_SIZE)
51
+
52
+ # Binding (Default)
53
+ self.declarations = Apollo::Agent.declare_entities(channel, opts)
54
+ queue = declarations[:queues]["fetcher.queue"]
55
+
56
+ # AMQP contexts for threads
57
+ contexts = []
58
+ (0...THREAD_POOL_SIZE).each do |i|
59
+ puts "FetcherAgent::initialize() - Creating context #{i}" if opts[:verbose]
60
+ end
50
61
 
51
- # Binding
52
- declarations[:queues]["fetcher.queue"].bind(declarations[:exchanges]["fetcher"]).subscribe do |delivery_info, metadata, payload|
53
- thread_pool.process {
54
- queued_url = JSON.parse(payload)
55
- url = queued_url["url"]
62
+ # AMQP contexts mutex/lock
63
+ self.mutex = Mutex.new()
56
64
 
57
- puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
65
+ exchange = self.declarations[:exchanges]["fetcher"]
58
66
 
59
- doc = nil
60
- begin
61
- doc = Apollo::Fetcher::SmartFetcher::fetch(url)
62
- rescue Exception => e
63
- puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
64
- end
67
+ queue.bind(exchange).subscribe(:ack => true) do |delivery_info, metadata, payload|
68
+ # Concurrency can cause trouble here; see https://groups.google.com/forum/?fromgroups=#!topic/ruby-amqp/aO9GPu-jxuE
69
+ queued_url = JSON.parse(payload)
70
+ url = queued_url["url"]
65
71
 
66
- doc = get_fetched_doc(queued_url, doc, metadata, opts)
67
-
68
- # send_response_msg(metadata[:reply_to], get_response_msg(queued_url, doc))
72
+ puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
73
+ self.mutex.synchronize {
74
+ puts "FetcherAgent: Acking - '#{delivery_info.delivery_tag}'" if opts[:verbose]
75
+ channel.basic_ack(delivery_info.delivery_tag, true)
76
+ }
69
77
 
78
+ begin
79
+ doc = Apollo::Fetcher::SmartFetcher::fetch(url)
80
+ doc = process_fetched_doc(queued_url, doc, metadata, opts)
81
+
70
82
  if(metadata && metadata[:reply_to])
71
- exchange_name = metadata[:reply_to]
83
+ puts "Replying to '#{metadata[:reply_to]}'"
84
+ send_response_msg(metadata[:reply_to], queued_url, doc)
85
+ end
72
86
 
73
- if(exchange_name != nil)
74
- msg = get_response_msg(queued_url, doc)
87
+ rescue Exception => e
88
+ puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
89
+ end
75
90
 
76
- x = declarations[:exchanges][exchange_name]
77
- x.publish(msg.to_json)
78
- end
79
- end
80
- }
91
+ doc
81
92
  end
82
93
  end
83
94
 
84
- def get_fetched_doc(queued_url, doc, metadata, opts={})
95
+ def process_fetched_doc(queued_url, doc, metadata, opts={})
85
96
  url = queued_url["url"]
86
97
 
87
98
  res = Apollo::Model::RawDocument.new
@@ -94,12 +105,23 @@ module Apollo
94
105
  return res
95
106
  end
96
107
 
97
- def get_response_msg(queued_url, doc)
108
# Builds the reply payload pairing the original request with the fetched
# document.
#
# @param queued_url [Object] the request that was processed
# @param doc [Object] the document produced for it
# @return [Hash] hash with :request and :response entries
def format_response_msg(queued_url, doc)
  { :request => queued_url, :response => doc }
end
114
+
115
# Publishes the response for +queued_url+ as JSON to the exchange registered
# under +dest+. Does nothing when +dest+ is nil.
#
# The exchange lookup and the publish run while holding the agent mutex.
#
# @param dest [String, nil] name of the reply exchange
# @param queued_url [Object] the request that was processed
# @param doc [Object] the document produced for it
def send_response_msg(dest, queued_url, doc)
  return if dest.nil?

  reply = format_response_msg(queued_url, doc)

  self.mutex.synchronize do
    target = self.declarations[:exchanges][dest]
    target.publish(reply.to_json)
  end
end
103
125
  end # class FetcherAgent
104
126
  end # module Agent
105
127
  end # module Apollo
@@ -22,6 +22,7 @@ require File.join(File.dirname(__FILE__), 'base_crawler')
22
22
  require File.join(File.dirname(__FILE__), 'google_crawler')
23
23
  require File.join(File.dirname(__FILE__), 'hacker_news_crawler')
24
24
  require File.join(File.dirname(__FILE__), 'slashdot_crawler')
25
+ require File.join(File.dirname(__FILE__), 'spider_crawler')
25
26
  require File.join(File.dirname(__FILE__), 'stackoverflow_crawler')
26
27
  require File.join(File.dirname(__FILE__), 'xkcd_crawler')
27
28
  require File.join(File.dirname(__FILE__), 'youjizz_crawler')
@@ -0,0 +1,52 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'base_crawler')
22
+
23
module Apollo
  module Crawler
    # Crawler that extracts no data of its own and only collects the links
    # found on a page, starting from Wikipedia's front page.
    class SpiderCrawler < BaseCrawler
      # @return [String] display name of the crawler
      def name()
        return "Spider"
      end

      # @return [String] URL the crawl starts from; also used as the base
      #   when resolving links in #extract_links
      def url()
        return "http://www.wikipedia.org/"
      end

      # This crawler produces no data records.
      #
      # @return [Array] always empty
      def extract_data(doc)
        []
      end

      # Collects the targets of all anchor elements in +doc+.
      #
      # @param doc [#xpath] parsed document
      # @return [Array<Hash>] unique entries of the form { :link => url }
      def extract_links(doc)
        links = doc.xpath("//a").map { |node|
          # Check for nil before converting to a string; after +to_s+ a
          # missing URL would be "" and could never be detected.
          # NOTE(review): assumes try_get_url returns nil for unusable hrefs - confirm.
          resolved = BaseCrawler.try_get_url(self.url, node['href'])
          next if resolved.nil?

          {
            :link => resolved.to_s
          }
        }

        # Skipped anchors leave nil placeholders in the mapped array.
        return links.compact.uniq
      end
    end # class SpiderCrawler
  end # module Crawler
end # module Apollo
@@ -19,6 +19,7 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  require 'amqp'
22
+ require 'amqp/extensions/rabbitmq'
22
23
  require 'bunny'
23
24
  require 'thread'
24
25
 
@@ -21,6 +21,8 @@
21
21
  require 'mongo'
22
22
  require 'mongoid'
23
23
 
24
+ require 'csv'
25
+
24
26
  module Apollo
25
27
  module Helper
26
28
  module Mongo
@@ -37,6 +39,36 @@ module Apollo
37
39
 
38
40
  return res
39
41
  end
42
+
43
# Reads a CSV file row by row, converts each row to a document with the given
# block and inserts the documents into the model's collection in batches.
#
# Rows for which the block returns nil are skipped. Without a block nothing
# is inserted.
#
# @param path [String] CSV file to read
# @param model [#collection, #where] model whose collection receives the documents
# @param bulk_size [Integer] number of documents per insert, at least 1
# @param validate [Boolean] when true, skip documents for which
#   +model.where(doc)+ already finds something
# @yieldparam row [Array] parsed CSV row
# @yieldreturn [Hash, nil] document to insert, or nil to skip the row
# @return [Integer] number of documents handed to the collection
# @raise [ArgumentError] if +bulk_size+ is not positive
def self.csv_bulk_insert(path, model, bulk_size, validate=false, &block)
  raise ArgumentError, "bulk_size must be a positive number" if bulk_size.to_i < 1

  inserted = 0
  batch = []

  CSV.foreach(path) do |row|
    doc = block_given? ? yield(row) : nil
    next if doc.nil?
    next if validate && model.where(doc).length != 0

    batch << doc

    # Flush only full batches; an empty batch must never be sent.
    next if batch.length < bulk_size

    model.collection.insert(batch)
    inserted += batch.length
    # Start a fresh array instead of clearing the one just handed over.
    batch = []
  end

  # Remainder smaller than one full batch.
  unless batch.empty?
    model.collection.insert(batch)
    inserted += batch.length
  end

  inserted
end
71
+
40
72
  end # Mongo
41
73
  end # module Helper
42
74
  end # module Apollo
@@ -29,9 +29,10 @@ module Apollo
29
29
  store_in collection: "domains"
30
30
 
31
31
  field :name
32
+ field :rank
32
33
 
33
34
  # Indexes
34
- index({ created_at: 1, updated_at: 1, name: 1 })
35
+ index({ created_at: 1, updated_at: 1, name: 1, rank: 1})
35
36
  end # class Domain
36
37
  end # module Model
37
38
  end # module Apollo
@@ -41,7 +41,8 @@ module Apollo
41
41
 
42
42
  # Declarations
43
43
  channel = amqp.create_channel
44
- self.declarations = Apollo::Agent.declare_exchanges(channel, opts)
44
+ # channel.prefetch(1)
45
+ self.declarations = Apollo::Agent.declare_entities(channel, opts)
45
46
 
46
47
  # Bindings
47
48
  declarations[:queues]["planner.fetched.queue"].bind(declarations[:exchanges]["planner.fetched"]).subscribe do |delivery_info, metadata, payload|
@@ -67,14 +68,17 @@ module Apollo
67
68
  end
68
69
  else
69
70
  doc = Apollo::Model::RawDocument.where(:sha_hash => response['sha_hash']).first
71
+ if(doc.nil? == false)
72
+ puts "Same as #{doc.inspect}"
73
+ end
70
74
  end
71
75
 
72
76
  if(doc.nil?)
73
77
  doc = Apollo::Model::RawDocument.new(response).save
74
- end
75
78
 
76
- # Publish
77
- declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
79
+ # Publish
80
+ declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
81
+ end
78
82
  end
79
83
 
80
84
  declarations[:queues]["planner.domained.queue"].bind(declarations[:exchanges]["planner.domained"]).subscribe do |delivery_info, metadata, payload|
@@ -103,6 +107,10 @@ module Apollo
103
107
  end
104
108
  end
105
109
 
110
# Counts the queued URLs that are currently in the given state.
#
# @param state [Symbol] value of the +state+ field to match
# @param opts [Hash] accepted but not used by this method
# @return [Integer] number of matching QueuedUrl documents
def get_url_count(state, opts={})
  in_state = Apollo::Model::QueuedUrl.where(:state => state)
  in_state.count
end
113
+
106
114
  def fetch_url(url, opts={})
107
115
  if(opts[:verbose])
108
116
  puts "AMQP fetching '#{url.inspect}'"
@@ -113,19 +121,22 @@ module Apollo
113
121
  end
114
122
 
115
123
  def get_next_url(opts={})
116
- Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
124
+ tmp = Apollo::Model::QueuedUrl.where({:state => :queued}).order_by(:created_at.asc)
125
+ tmp.find_and_modify({ "$set" => { state: :fetching }}, new: true)
117
126
  end
118
127
 
119
128
  def fetch_queued_urls(opts={})
120
- url = get_next_url(opts)
129
+ fetching_count = Apollo::Model::QueuedUrl.where({:state => :fetching}).count
121
130
 
122
- while url
123
- puts url.inspect
124
- # puts "Count of URLs in Queue: #{url.count}" if opts[:verbose]
131
+ if(fetching_count > 4)
132
+ puts "Fetching too many URLs. Slowing down for a while ..."
133
+ return
134
+ end
125
135
 
136
+ while get_url_count(:fetching) < 4
137
+ url = get_next_url(opts)
138
+ puts "SmartPlanner::fetch_queued_urls() - Queueing: #{url.inspect}"
126
139
  fetch_url(url, opts)
127
-
128
- url = get_next_url()
129
140
  end
130
141
  end
131
142
 
@@ -138,14 +138,16 @@ module Apollo
138
138
  end
139
139
 
140
140
  Thread::new {
141
- CSV.foreach(path) do |row|
141
+ Apollo::Helper::Mongo::csv_bulk_insert(path, Apollo::Model::Domain, 1000, false) do |row|
142
+ rank = row[0].to_i
142
143
  name = row[1]
143
- domain = Apollo::Model::Domain.where({:name => name}).first()
144
- if(domain.nil?)
145
- domain = Apollo::Model::Domain.new({:name => name})
146
- domain.save
147
- print "."
148
- end
144
+
145
+ res = {
146
+ :rank => rank,
147
+ :name => name
148
+ }
149
+
150
+ res
149
151
  end
150
152
  }
151
153
  end
@@ -18,7 +18,7 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
- require File.join(File.dirname(__FILE__), "../model/models")
21
+ require File.join(File.dirname(__FILE__), "../model/models.rb")
22
22
 
23
23
  module Apollo
24
24
  module Scheduler
@@ -26,13 +26,11 @@ module Apollo
26
26
  def self.schedule(url, crawler=nil, opts={})
27
27
  queued_url = Apollo::Model::QueuedUrl.where(:url => url).first
28
28
 
29
- if(queued_url.nil?)
30
- qu = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
31
- qu.save
32
- return qu
33
- end
34
-
35
- return nil
29
+ return queued_url if queued_url
30
+
31
+ res = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
32
+ res.save
33
+ return res
36
34
  end
37
35
  end # class BaseScheduler
38
36
  end # module Scheduler
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.26'
22
+ VERSION = '0.1.27'
23
23
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.26
4
+ version: 0.1.27
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak
@@ -391,6 +391,7 @@ dependencies:
391
391
  description: Gem for crawling data from external sources
392
392
  email: korczis@gmail.com
393
393
  executables:
394
+ - apollo-aws
394
395
  - apollo-console
395
396
  - apollo-crawler
396
397
  - apollo-platform
@@ -432,6 +433,7 @@ files:
432
433
  - ./lib/apollo_crawler/crawler/google_crawler.rb
433
434
  - ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
434
435
  - ./lib/apollo_crawler/crawler/slashdot_crawler.rb
436
+ - ./lib/apollo_crawler/crawler/spider_crawler.rb
435
437
  - ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
436
438
  - ./lib/apollo_crawler/crawler/xkcd_crawler.rb
437
439
  - ./lib/apollo_crawler/crawler/youjizz_crawler.rb
@@ -473,6 +475,7 @@ files:
473
475
  - ./lib/apollo_crawler/store/base_store.rb
474
476
  - ./lib/apollo_crawler/store/stores.rb
475
477
  - ./lib/apollo_crawler/version.rb
478
+ - bin/apollo-aws
476
479
  - bin/apollo-console
477
480
  - bin/apollo-crawler
478
481
  - bin/apollo-platform