apollo-crawler 0.1.11 → 0.1.12

checksums.yaml CHANGED
@@ -1,15 +1,15 @@
  ---
  !binary "U0hBMQ==":
  metadata.gz: !binary |-
- M2EwYTQzNjFlZWY1YWFhOTNkYjAyNzYzODM1ZWRiYWRkOWQzYzk5MQ==
+ MjkxYjlmY2NjMDYwODcxN2JmMzA1MDM3NzM5NzQ1ZWVhNDNiYWQ0MQ==
  data.tar.gz: !binary |-
- YTU3NzEwMzQ1OGYyNGNkMDM5YjdmMTA0ODA3OGI0ZTQ4YjMxNTQwZA==
+ N2ViMTdhNmQ1OGM3ZTczYjIxZWU1Y2NlY2NlYWMxMDM1MDkwZjBjYg==
  !binary "U0hBNTEy":
  metadata.gz: !binary |-
- OTYxMWI5ZmFmYTg3NWU5ZGI2YjcwY2UzN2UxNzIwN2E0OTU3YmEzNGNlMmQ0
- YmE5NmNmY2MxNjMyNmY5MzgxNjc3ZWIxMzY1ZjY5ZTVkNjFmMzE5ZWQxZGI0
- YTY1YzcyMDc4Y2MxN2FlYmZiZTY1ZGI2YjQxOTViN2ZiY2M3MjQ=
+ NmNiNjMzZGFjY2ZmOTNjOTU4NTAxMGQzODZlNGYyOWI2MGQzY2YwM2Q4ZmMw
+ NDMzYzM2OTNkYmU2MzJiYzNhYzMwNGNmZDI0OWZiM2ZiZjJiYjFkMWExY2Rh
+ NTA3MDkxNTA1OTk1NWE5ZWMyNGFiZjY5ODhiMDMxZDU5NjgwZDU=
  data.tar.gz: !binary |-
- YzYwZTdmMTUxMWZiY2ZiZTBhYzU4OTUxZmQxNTE2NTYyY2Q0YThmNTA1OWU4
- NDc4YTMzNTBhNzU4ODIwNzVlZDcyYTlmNjBlODE0MDM2YTI4ZWY4ZWY3NzMw
- ODM1MjExYjE1YmYzNDAwY2I0NmIzZDVhMDQyYTY4ODk5NzVjZjI=
+ MWI1YWI3NjcwYzQ1NWVkZDI2ZjM3NjY5MGRjMzZkMDQxMThlMWE2MGU3MzAx
+ ZWE5ZWIyYWExNjdjMjYyYjUxNTU5MWZlNGI5MWUzOWYwZGI2NjQ2YTNkMTIy
+ YmRiNzIzNGY5ZThlNTdkMzIwODJkNjc2ZWUyNzQ5MWNlOWZlM2I=
@@ -29,6 +29,7 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/base_cache')
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/factory')
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memcached_cache')
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memory_cache')
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/mongo_cache')
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/null_cache')

  # Crawlers
@@ -0,0 +1,54 @@
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in
+ # all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ # THE SOFTWARE.
+
+ require File.join(File.dirname(__FILE__), 'base_cache')
+
+ require 'mongo'
+
+ module Apollo
+   module Cache
+     class MongoCache < BaseCache
+       def initialize
+         @mongo_client = Mongo::MongoClient.new('localhost', 27017, :pool_size => 5, :pool_timeout => 5)
+         @db = @mongo_client['apollo-crawler']
+       end
+
+       # Get value associated with key from cache
+       def get(key, *args)
+         res = @db['docs'].find({:url => key})
+
+         # Not found, Create, cache and return
+         if res.nil? || res.count < 1 && block_given?
+           res = yield args
+           return self.set(key, res)
+         end
+
+         return res.to_a[0]
+       end
+
+       # Set value associated with key
+       # Return cached value
+       def set(key, value)
+         @db['docs'].insert(value)
+         return value
+       end
+     end # MongoCache
+   end # Cache
+ end # Apollo
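
The new MongoCache follows the same get/set contract as the other cache backends: get(key) looks the key up in the docs collection and, when nothing is cached and a block is given, evaluates the block, stores its result via set, and returns it. A minimal usage sketch, assuming a local mongod on the default port 27017 and the gem's lib directory on the load path; example_metadoc is a hypothetical stand-in for the metadoc hash the crawler normally builds:

require 'apollo_crawler/cache/mongo_cache'

# Hypothetical helper mimicking the shape of a cached metadoc.
def example_metadoc(url)
  { 'url' => url, 'doc' => '<html><body>hello</body></html>', 'version' => 0 }
end

cache = Apollo::Cache::MongoCache.new

# Miss: the block runs and its result is inserted into the 'docs' collection.
doc = cache.get('http://example.com') { example_metadoc('http://example.com') }

# Hit: served straight from MongoDB, the block is not evaluated again.
doc = cache.get('http://example.com') { raise 'not reached on a cache hit' }
puts doc['url']
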
@@ -150,7 +150,6 @@ module Apollo
  # Format ETL result
  res = {
  :crawler => self.class.name,
- :title => doc.title,
  :data => data,
  :links => links
  }
@@ -158,6 +157,17 @@ module Apollo
  return res
  end

+ def self.create_metadoc(url, doc)
+ return {
+ 'url' => url,
+ 'doc' => doc.encode('utf-8'),
+ 'hash' => Digest::SHA256.new.update(doc).hexdigest,
+ 'created_at' => Time.now.utc,
+ 'expires_at' => nil,
+ 'version' => 0
+ }
+ end
+
  # Fetch document
  def fetch_document(url)
  # TODO: Refactor following idiom
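
create_metadoc is what now gets written to the cache instead of the raw HTML string: the body re-encoded as UTF-8 plus a SHA-256 digest and some bookkeeping fields. A quick sketch of the resulting shape, assuming these methods live on Apollo::Crawler::BaseCrawler as the calls elsewhere in this file suggest (illustrative values only):

require 'digest'
# Assumes the gem is already loaded so Apollo::Crawler::BaseCrawler is defined.

html = '<html><body>Hello</body></html>'
metadoc = Apollo::Crawler::BaseCrawler.create_metadoc('http://example.com', html)

metadoc['url']        # => "http://example.com"
metadoc['hash']       # => Digest::SHA256.hexdigest(html)
metadoc['version']    # => 0
metadoc['expires_at'] # => nil
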
@@ -171,15 +181,15 @@ module Apollo

  # TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
  cache = Apollo::Cache::Factory.instance.construct
- raw = cache.get(url) do
+ metadoc = cache.get(url) do
  max_attempts = 3
  attempt_no = 0
  success = false

- res = nil
+ doc = nil
  while(attempt_no < max_attempts && success == false) do
  begin
- res = BaseCrawler.fetch(url)
+ doc = BaseCrawler.fetch(url)
  success = true
  rescue Exception => e
  puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
@@ -190,12 +200,12 @@ module Apollo
  end
  end

- res
+ # Create metadata
+ BaseCrawler.create_metadoc(url, doc)
  end

  # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
- doc = Nokogiri::HTML(raw)
- return doc
+ return Nokogiri::HTML(metadoc['doc'])
  end

  # Extracts data from document
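
Taken together, the two hunks above change fetch_document so that the cache stores whole metadocs and the Nokogiri document is rebuilt from metadoc['doc'] on every call. A condensed sketch of the new flow, for orientation only (retry loop elided; names follow the diff):

def fetch_document(url)
  cache = Apollo::Cache::Factory.instance.construct

  metadoc = cache.get(url) do
    doc = BaseCrawler.fetch(url)          # raw HTML (retried up to 3 times in the real code)
    BaseCrawler.create_metadoc(url, doc)  # the block's value is what gets cached
  end

  Nokogiri::HTML(metadoc['doc'])          # parse the cached body
end
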
@@ -18,14 +18,16 @@
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  # THE SOFTWARE.

+ require "open-uri"
+ require "nokogiri"
+
  module Apollo
  module Fetcher
  class BaseFetcher
  def self.fetch(url)
  # TODO: Throw exception ???
- return nil
- end
-
+ return open(url).read
+ end
  end # class BaseFetcher
  end # module Fetcher
  end # module Apollo
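
BaseFetcher.fetch used to return nil; it now performs the download itself via open-uri, and the other fetchers (below) delegate to it. A minimal sketch of calling it directly, assuming the gem's lib directory is on the load path and network access is available:

require 'apollo_crawler/fetcher/base_fetcher'  # assumed require path under lib/

html = Apollo::Fetcher::BaseFetcher.fetch('http://example.com')
puts html[0, 80]  # first few characters of the fetched page
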
@@ -28,7 +28,7 @@ module Apollo
  class SimpleFetcher < BaseFetcher
  def self.fetch(url)
  # TODO: Throw exception ???
- return open(url).read
+ return BaseFetcher::fetch(url)
  end
  end # class SimpleFetcher
  end # module Fetcher
@@ -38,11 +38,11 @@ module Apollo
  sleep(diff)
  end

- res = open(url).read
+ res = BaseFetcher::fetch(url)

  @@LAST_FETCH = DateTime.now
  return res
  end
  end # class SimpleFetcher
  end # module SmartFetcher
- end # module Apollo
+ end # module Apollo
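
The smart fetcher keeps its rate limiting (the sleep(diff) against @@LAST_FETCH above) and only swaps the actual download for BaseFetcher::fetch. For orientation, a generic sketch of that throttle-then-delegate pattern; this is not the gem's exact code, which computes the delay from @@LAST_FETCH earlier in the file:

# Illustrative only: wait out a minimum interval between fetches, then
# hand the download to the shared BaseFetcher.
MIN_INTERVAL = 1.0  # seconds; hypothetical value
last_fetch = nil

fetch_politely = lambda do |url|
  if last_fetch
    elapsed = Time.now - last_fetch
    sleep(MIN_INTERVAL - elapsed) if elapsed < MIN_INTERVAL
  end
  last_fetch = Time.now
  Apollo::Fetcher::BaseFetcher.fetch(url)
end

# fetch_politely.call('http://example.com')
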
@@ -26,6 +26,7 @@ require File.join(File.dirname(__FILE__), 'cache/base_cache')
  require File.join(File.dirname(__FILE__), 'cache/factory')
  require File.join(File.dirname(__FILE__), 'cache/memcached_cache')
  require File.join(File.dirname(__FILE__), 'cache/memory_cache')
+ require File.join(File.dirname(__FILE__), 'cache/mongo_cache')
  require File.join(File.dirname(__FILE__), 'cache/null_cache')

  # Crawlers
@@ -65,18 +65,23 @@ module Apollo
  # Initialize command-line options
  def init_options
  @options = {}
+
  @options[:doc_limit] = nil
  @options[:verbose] = false
  @options[:version] = false
+
  @options[:cache_dirs] = [
  RbConfig::CACHES_DIR
  ]
+
  @options[:crawler_dirs] = [
  RbConfig::CRAWLERS_DIR
  ]
+
  @options[:formatter_dirs] = [
  RbConfig::FORMATTERS_DIR
  ]
+
  @options[:generate_crawler] = nil

  @optparser = OptionParser.new do | opts |
@@ -139,6 +144,56 @@ module Apollo
  @optparser.parse!(args)
  end

+ def process_options(args)
+ if(@options[:version])
+ puts Apollo::VERSION
+ return 0
+ end
+
+ if(@options[:show_help])
+ puts @optparser
+ return 0
+ end
+
+ if(@options[:generate_crawler])
+ name = @options[:generate_crawler]
+ url = args.length > 0 ? args[0] : nil
+ matcher = args.length > 1 ? args[1] : nil
+
+ return self.generate_crawler(name, url, matcher)
+ end
+
+ if(@options[:list_formatters])
+ list_formatters()
+ return 0
+ end
+
+ if(@options[:list_crawlers])
+ list_crawlers()
+ return 0
+ end
+
+ return nil
+ end
+
+ def init_formatter()
+ # Set default formatter here
+ formatter_name = "json"
+ if(@options[:formatter])
+ formatter_name = @options[:formatter]
+ end
+
+ # Look for specified formatter
+ f = @formatters.select { |k, v|
+ name = formatter_name.gsub(Apollo::Formatter::BaseFormatter::name_re, "")
+ k.downcase == name
+ }
+
+ if(f)
+ @formatter = f[f.keys[0]]
+ end
+ end
+
  # Load global options first
  # Merge it with local options (if they exists)
  def load_config_file()
@@ -328,96 +383,31 @@ module Apollo
  return 0
  end

+ # Show tabular data in form of CLI table
  def self.console_table(headings, rows)
  table = Terminal::Table.new :headings => headings, :rows => rows
  puts table
  end

+ # List available crawlers
  def list_crawlers()
  CrawlerProgram.console_table(['name', 'class'], @crawlers)
  return
  end

+ # List available formatters
  def list_formatters()
  CrawlerProgram.console_table(['name', 'class'], @formatters)
  return
  end

- def run(args = ARGV)
- # puts "#{ARGV.inspect}"
-
- init_options()
-
- parse_options(args)
-
- if(@options[:version])
- puts Apollo::VERSION
- return 0
- end
-
- if(@options[:show_help])
- puts @optparser
- return 0
- end
-
- load_config_file()
-
- if(@options[:generate_crawler])
- name = @options[:generate_crawler]
- url = args.length > 0 ? args[0] : nil
- matcher = args.length > 1 ? args[1] : nil
-
- return self.generate_crawler(name, url, matcher)
- end
-
- register_modules()
-
- # Set default formatter here
- formatter_name = "json"
- if(@options[:formatter])
- formatter_name = @options[:formatter]
- end
-
- # Look for specified formatter
- f = @formatters.select { |k, v|
- name = formatter_name.gsub(Apollo::Formatter::BaseFormatter::name_re, "")
- k.downcase == name
- }
-
- if(f)
- @formatter = f[f.keys[0]]
- end
-
- if(@options[:list_formatters])
- list_formatters()
- return 0
- end
-
- if(@options[:list_crawlers])
- list_crawlers()
- return 0
- end
-
- crawlers = []
- if(args.length > 0)
- crawlers << args.shift
- end
-
- if(@options[:run_all])
- crawlers = @crawlers.keys
- end
-
- if(crawlers.empty?)
- puts @optparser
- return 0
- end
-
- crawlers.each do |crawler|
- crawler_name = crawler.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
+ def run_crawlers(crawlers, args)
+ crawlers.each do |name|
+ crawler_name = name.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")

- p = @crawlers[crawler_name]
- if(p == nil)
- puts "Invalid crawler name - '#{crawler}'"
+ crawler = @crawlers[crawler_name]
+ if(crawler == nil)
+ puts "Invalid crawler name - '#{name}'"
  puts "See program help"
  return 0
  end
@@ -430,7 +420,7 @@ module Apollo
  :doc_limit => @options[:doc_limit]
  }

- res = p.new.etl(args, opts) { | docs |
+ res = crawler.new.etl(args, opts) { | docs |
  if(docs.nil?)
  next
  end
@@ -447,5 +437,50 @@ module Apollo

  return 0
  end
+
+ # Get crawlers passd to cmd-line
+ def get_crawlers(args)
+ crawlers = []
+ if(args.length > 0)
+ crawlers << args.shift
+ end
+
+ if(@options[:run_all])
+ crawlers = @crawlers.keys
+ end
+
+ return crawlers
+ end
+
+ # Init program
+ def init_program(args)
+ init_options()
+
+ parse_options(args)
+
+ load_config_file()
+
+ register_modules()
+
+ res = process_options(args)
+ if res != nil
+ return res
+ end
+
+ init_formatter()
+ end
+
+ # Run Program
+ def run(args = ARGV)
+ init_program(args)
+
+ crawlers = get_crawlers(args)
+ if(crawlers.empty?)
+ puts @optparser
+ return 0
+ end
+
+ return run_crawlers(crawlers, args)
+ end
  end
  end
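
With this refactor, run is a thin wrapper: init_program parses and processes the options, get_crawlers picks the crawler names from the command line (or all of them with the run-all option), and run_crawlers does the work. A hypothetical launcher script showing how the refactored entry point would be driven; the require path and the Apollo::CrawlerProgram constant are assumptions inferred from the calls above, not part of this diff:

#!/usr/bin/env ruby
# Hypothetical bin script, for illustration only.
require 'apollo_crawler'  # assumed entry point

status = Apollo::CrawlerProgram.new.run(ARGV)
exit(status.is_a?(Integer) ? status : 0)
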
@@ -19,5 +19,5 @@
  # THE SOFTWARE.

  module Apollo
- VERSION = '0.1.11'
+ VERSION = '0.1.12'
  end # Apollo
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: apollo-crawler
  version: !ruby/object:Gem::Version
- version: 0.1.11
+ version: 0.1.12
  platform: ruby
  authors:
  - Tomas Korcak
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-02-28 00:00:00.000000000 Z
+ date: 2013-03-03 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: amqp
@@ -80,6 +80,20 @@ dependencies:
  - - ~>
  - !ruby/object:Gem::Version
  version: 1.5.5
+ - !ruby/object:Gem::Dependency
+ name: ffi
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
  - !ruby/object:Gem::Dependency
  name: eventmachine
  requirement: !ruby/object:Gem::Requirement
@@ -136,6 +150,34 @@ dependencies:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ - !ruby/object:Gem::Dependency
+ name: mongo
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: mongoid
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
  - !ruby/object:Gem::Dependency
  name: mime-types
  requirement: !ruby/object:Gem::Requirement
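
mongo and mongoid join ffi as new runtime dependencies, so installing 0.1.12 pulls the MongoDB driver in automatically; a running MongoDB instance is still needed at runtime because MongoCache connects to localhost:27017. A minimal Gemfile sketch for consuming this release:

# Gemfile (illustrative)
source 'https://rubygems.org'

gem 'apollo-crawler', '0.1.12'
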
@@ -220,6 +262,48 @@ dependencies:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ - !ruby/object:Gem::Dependency
+ name: guard
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: guard-rake
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: guard-rspec
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
  description: Gem for crawling data from external sources
  email: korczis@gmail.com
  executables:
@@ -240,6 +324,7 @@ files:
  - ./lib/apollo_crawler/cache/null_cache.rb
  - ./lib/apollo_crawler/cache/memory_cache.rb
  - ./lib/apollo_crawler/cache/base_cache.rb
+ - ./lib/apollo_crawler/cache/mongo_cache.rb
  - ./lib/apollo_crawler/cache/memcached_cache.rb
  - ./lib/apollo_crawler/crawler/xkcd_crawler.rb
  - ./lib/apollo_crawler/crawler/google_crawler.rb