apollo-crawler 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- M2EwYTQzNjFlZWY1YWFhOTNkYjAyNzYzODM1ZWRiYWRkOWQzYzk5MQ==
4
+ MjkxYjlmY2NjMDYwODcxN2JmMzA1MDM3NzM5NzQ1ZWVhNDNiYWQ0MQ==
5
5
  data.tar.gz: !binary |-
6
- YTU3NzEwMzQ1OGYyNGNkMDM5YjdmMTA0ODA3OGI0ZTQ4YjMxNTQwZA==
6
+ N2ViMTdhNmQ1OGM3ZTczYjIxZWU1Y2NlY2NlYWMxMDM1MDkwZjBjYg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- OTYxMWI5ZmFmYTg3NWU5ZGI2YjcwY2UzN2UxNzIwN2E0OTU3YmEzNGNlMmQ0
10
- YmE5NmNmY2MxNjMyNmY5MzgxNjc3ZWIxMzY1ZjY5ZTVkNjFmMzE5ZWQxZGI0
11
- YTY1YzcyMDc4Y2MxN2FlYmZiZTY1ZGI2YjQxOTViN2ZiY2M3MjQ=
9
+ NmNiNjMzZGFjY2ZmOTNjOTU4NTAxMGQzODZlNGYyOWI2MGQzY2YwM2Q4ZmMw
10
+ NDMzYzM2OTNkYmU2MzJiYzNhYzMwNGNmZDI0OWZiM2ZiZjJiYjFkMWExY2Rh
11
+ NTA3MDkxNTA1OTk1NWE5ZWMyNGFiZjY5ODhiMDMxZDU5NjgwZDU=
12
12
  data.tar.gz: !binary |-
13
- YzYwZTdmMTUxMWZiY2ZiZTBhYzU4OTUxZmQxNTE2NTYyY2Q0YThmNTA1OWU4
14
- NDc4YTMzNTBhNzU4ODIwNzVlZDcyYTlmNjBlODE0MDM2YTI4ZWY4ZWY3NzMw
15
- ODM1MjExYjE1YmYzNDAwY2I0NmIzZDVhMDQyYTY4ODk5NzVjZjI=
13
+ MWI1YWI3NjcwYzQ1NWVkZDI2ZjM3NjY5MGRjMzZkMDQxMThlMWE2MGU3MzAx
14
+ ZWE5ZWIyYWExNjdjMjYyYjUxNTU5MWZlNGI5MWUzOWYwZGI2NjQ2YTNkMTIy
15
+ YmRiNzIzNGY5ZThlNTdkMzIwODJkNjc2ZWUyNzQ5MWNlOWZlM2I=
@@ -29,6 +29,7 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/base_cache')
29
29
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/factory')
30
30
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memcached_cache')
31
31
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memory_cache')
32
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/mongo_cache')
32
33
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/null_cache')
33
34
 
34
35
  # Crawlers
@@ -0,0 +1,54 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require File.join(File.dirname(__FILE__), 'base_cache')
22
+
23
+ require 'mongo'
24
+
25
+ module Apollo
26
+ module Cache
27
+ class MongoCache < BaseCache
28
+ def initialize
29
+ @mongo_client = Mongo::MongoClient.new('localhost', 27017, :pool_size => 5, :pool_timeout => 5)
30
+ @db = @mongo_client['apollo-crawler']
31
+ end
32
+
33
+ # Get value associated with key from cache
34
+ def get(key, *args)
35
+ res = @db['docs'].find({:url => key})
36
+
37
+ # Not found, Create, cache and return
38
+ if res.nil? || res.count < 1 && block_given?
39
+ res = yield args
40
+ return self.set(key, res)
41
+ end
42
+
43
+ return res.to_a[0]
44
+ end
45
+
46
+ # Set value associated with key
47
+ # Return cached value
48
+ def set(key, value)
49
+ @db['docs'].insert(value)
50
+ return value
51
+ end
52
+ end # MongoCache
53
+ end # Cache
54
+ end # Apollo
@@ -150,7 +150,6 @@ module Apollo
150
150
  # Format ETL result
151
151
  res = {
152
152
  :crawler => self.class.name,
153
- :title => doc.title,
154
153
  :data => data,
155
154
  :links => links
156
155
  }
@@ -158,6 +157,17 @@ module Apollo
158
157
  return res
159
158
  end
160
159
 
160
+ def self.create_metadoc(url, doc)
161
+ return {
162
+ 'url' => url,
163
+ 'doc' => doc.encode('utf-8'),
164
+ 'hash' => Digest::SHA256.new.update(doc).hexdigest,
165
+ 'created_at' => Time.now.utc,
166
+ 'expires_at' => nil,
167
+ 'version' => 0
168
+ }
169
+ end
170
+
161
171
  # Fetch document
162
172
  def fetch_document(url)
163
173
  # TODO: Refactor following idiom
@@ -171,15 +181,15 @@ module Apollo
171
181
 
172
182
  # TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
173
183
  cache = Apollo::Cache::Factory.instance.construct
174
- raw = cache.get(url) do
184
+ metadoc = cache.get(url) do
175
185
  max_attempts = 3
176
186
  attempt_no = 0
177
187
  success = false
178
188
 
179
- res = nil
189
+ doc = nil
180
190
  while(attempt_no < max_attempts && success == false) do
181
191
  begin
182
- res = BaseCrawler.fetch(url)
192
+ doc = BaseCrawler.fetch(url)
183
193
  success = true
184
194
  rescue Exception => e
185
195
  puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
@@ -190,12 +200,12 @@ module Apollo
190
200
  end
191
201
  end
192
202
 
193
- res
203
+ # Create metadata
204
+ BaseCrawler.create_metadoc(url, doc)
194
205
  end
195
206
 
196
207
  # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
197
- doc = Nokogiri::HTML(raw)
198
- return doc
208
+ return Nokogiri::HTML(metadoc['doc'])
199
209
  end
200
210
 
201
211
  # Extracts data from document
@@ -18,14 +18,16 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
+ require "open-uri"
22
+ require "nokogiri"
23
+
21
24
  module Apollo
22
25
  module Fetcher
23
26
  class BaseFetcher
24
27
  def self.fetch(url)
25
28
  # TODO: Throw exception ???
26
- return nil
27
- end
28
-
29
+ return open(url).read
30
+ end
29
31
  end # class BaseFetcher
30
32
  end # module Fetcher
31
33
  end # module Apollo
@@ -28,7 +28,7 @@ module Apollo
28
28
  class SimpleFetcher < BaseFetcher
29
29
  def self.fetch(url)
30
30
  # TODO: Throw exception ???
31
- return open(url).read
31
+ return BaseFetcher::fetch(url)
32
32
  end
33
33
  end # class SimpleFetcher
34
34
  end # module Fetcher
@@ -38,11 +38,11 @@ module Apollo
38
38
  sleep(diff)
39
39
  end
40
40
 
41
- res = open(url).read
41
+ res = BaseFetcher::fetch(url)
42
42
 
43
43
  @@LAST_FETCH = DateTime.now
44
44
  return res
45
45
  end
46
46
  end # class SimpleFetcher
47
47
  end # module SmartFetcher
48
- end # module Apollo
48
+ end # module Apollo
@@ -26,6 +26,7 @@ require File.join(File.dirname(__FILE__), 'cache/base_cache')
26
26
  require File.join(File.dirname(__FILE__), 'cache/factory')
27
27
  require File.join(File.dirname(__FILE__), 'cache/memcached_cache')
28
28
  require File.join(File.dirname(__FILE__), 'cache/memory_cache')
29
+ require File.join(File.dirname(__FILE__), 'cache/mongo_cache')
29
30
  require File.join(File.dirname(__FILE__), 'cache/null_cache')
30
31
 
31
32
  # Crawlers
@@ -65,18 +65,23 @@ module Apollo
65
65
  # Initialize command-line options
66
66
  def init_options
67
67
  @options = {}
68
+
68
69
  @options[:doc_limit] = nil
69
70
  @options[:verbose] = false
70
71
  @options[:version] = false
72
+
71
73
  @options[:cache_dirs] = [
72
74
  RbConfig::CACHES_DIR
73
75
  ]
76
+
74
77
  @options[:crawler_dirs] = [
75
78
  RbConfig::CRAWLERS_DIR
76
79
  ]
80
+
77
81
  @options[:formatter_dirs] = [
78
82
  RbConfig::FORMATTERS_DIR
79
83
  ]
84
+
80
85
  @options[:generate_crawler] = nil
81
86
 
82
87
  @optparser = OptionParser.new do | opts |
@@ -139,6 +144,56 @@ module Apollo
139
144
  @optparser.parse!(args)
140
145
  end
141
146
 
147
+ def process_options(args)
148
+ if(@options[:version])
149
+ puts Apollo::VERSION
150
+ return 0
151
+ end
152
+
153
+ if(@options[:show_help])
154
+ puts @optparser
155
+ return 0
156
+ end
157
+
158
+ if(@options[:generate_crawler])
159
+ name = @options[:generate_crawler]
160
+ url = args.length > 0 ? args[0] : nil
161
+ matcher = args.length > 1 ? args[1] : nil
162
+
163
+ return self.generate_crawler(name, url, matcher)
164
+ end
165
+
166
+ if(@options[:list_formatters])
167
+ list_formatters()
168
+ return 0
169
+ end
170
+
171
+ if(@options[:list_crawlers])
172
+ list_crawlers()
173
+ return 0
174
+ end
175
+
176
+ return nil
177
+ end
178
+
179
+ def init_formatter()
180
+ # Set default formatter here
181
+ formatter_name = "json"
182
+ if(@options[:formatter])
183
+ formatter_name = @options[:formatter]
184
+ end
185
+
186
+ # Look for specified formatter
187
+ f = @formatters.select { |k, v|
188
+ name = formatter_name.gsub(Apollo::Formatter::BaseFormatter::name_re, "")
189
+ k.downcase == name
190
+ }
191
+
192
+ if(f)
193
+ @formatter = f[f.keys[0]]
194
+ end
195
+ end
196
+
142
197
  # Load global options first
143
198
  # Merge it with local options (if they exist)
144
199
  def load_config_file()
@@ -328,96 +383,31 @@ module Apollo
328
383
  return 0
329
384
  end
330
385
 
386
+ # Show tabular data in form of CLI table
331
387
  def self.console_table(headings, rows)
332
388
  table = Terminal::Table.new :headings => headings, :rows => rows
333
389
  puts table
334
390
  end
335
391
 
392
+ # List available crawlers
336
393
  def list_crawlers()
337
394
  CrawlerProgram.console_table(['name', 'class'], @crawlers)
338
395
  return
339
396
  end
340
397
 
398
+ # List available formatters
341
399
  def list_formatters()
342
400
  CrawlerProgram.console_table(['name', 'class'], @formatters)
343
401
  return
344
402
  end
345
403
 
346
- def run(args = ARGV)
347
- # puts "#{ARGV.inspect}"
348
-
349
- init_options()
350
-
351
- parse_options(args)
352
-
353
- if(@options[:version])
354
- puts Apollo::VERSION
355
- return 0
356
- end
357
-
358
- if(@options[:show_help])
359
- puts @optparser
360
- return 0
361
- end
362
-
363
- load_config_file()
364
-
365
- if(@options[:generate_crawler])
366
- name = @options[:generate_crawler]
367
- url = args.length > 0 ? args[0] : nil
368
- matcher = args.length > 1 ? args[1] : nil
369
-
370
- return self.generate_crawler(name, url, matcher)
371
- end
372
-
373
- register_modules()
374
-
375
- # Set default formatter here
376
- formatter_name = "json"
377
- if(@options[:formatter])
378
- formatter_name = @options[:formatter]
379
- end
380
-
381
- # Look for specified formatter
382
- f = @formatters.select { |k, v|
383
- name = formatter_name.gsub(Apollo::Formatter::BaseFormatter::name_re, "")
384
- k.downcase == name
385
- }
386
-
387
- if(f)
388
- @formatter = f[f.keys[0]]
389
- end
390
-
391
- if(@options[:list_formatters])
392
- list_formatters()
393
- return 0
394
- end
395
-
396
- if(@options[:list_crawlers])
397
- list_crawlers()
398
- return 0
399
- end
400
-
401
- crawlers = []
402
- if(args.length > 0)
403
- crawlers << args.shift
404
- end
405
-
406
- if(@options[:run_all])
407
- crawlers = @crawlers.keys
408
- end
409
-
410
- if(crawlers.empty?)
411
- puts @optparser
412
- return 0
413
- end
414
-
415
- crawlers.each do |crawler|
416
- crawler_name = crawler.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
404
+ def run_crawlers(crawlers, args)
405
+ crawlers.each do |name|
406
+ crawler_name = name.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
417
407
 
418
- p = @crawlers[crawler_name]
419
- if(p == nil)
420
- puts "Invalid crawler name - '#{crawler}'"
408
+ crawler = @crawlers[crawler_name]
409
+ if(crawler == nil)
410
+ puts "Invalid crawler name - '#{name}'"
421
411
  puts "See program help"
422
412
  return 0
423
413
  end
@@ -430,7 +420,7 @@ module Apollo
430
420
  :doc_limit => @options[:doc_limit]
431
421
  }
432
422
 
433
- res = p.new.etl(args, opts) { | docs |
423
+ res = crawler.new.etl(args, opts) { | docs |
434
424
  if(docs.nil?)
435
425
  next
436
426
  end
@@ -447,5 +437,50 @@ module Apollo
447
437
 
448
438
  return 0
449
439
  end
440
+
441
+ # Get crawlers passed on the command line
442
+ def get_crawlers(args)
443
+ crawlers = []
444
+ if(args.length > 0)
445
+ crawlers << args.shift
446
+ end
447
+
448
+ if(@options[:run_all])
449
+ crawlers = @crawlers.keys
450
+ end
451
+
452
+ return crawlers
453
+ end
454
+
455
+ # Init program
456
+ def init_program(args)
457
+ init_options()
458
+
459
+ parse_options(args)
460
+
461
+ load_config_file()
462
+
463
+ register_modules()
464
+
465
+ res = process_options(args)
466
+ if res != nil
467
+ return res
468
+ end
469
+
470
+ init_formatter()
471
+ end
472
+
473
+ # Run Program
474
+ def run(args = ARGV)
475
+ init_program(args)
476
+
477
+ crawlers = get_crawlers(args)
478
+ if(crawlers.empty?)
479
+ puts @optparser
480
+ return 0
481
+ end
482
+
483
+ return run_crawlers(crawlers, args)
484
+ end
450
485
  end
451
486
  end
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.11'
22
+ VERSION = '0.1.12'
23
23
  end # Apollo
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.11
4
+ version: 0.1.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-02-28 00:00:00.000000000 Z
11
+ date: 2013-03-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: amqp
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - ~>
81
81
  - !ruby/object:Gem::Version
82
82
  version: 1.5.5
83
+ - !ruby/object:Gem::Dependency
84
+ name: ffi
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ! '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: eventmachine
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -136,6 +150,34 @@ dependencies:
136
150
  - - ! '>='
137
151
  - !ruby/object:Gem::Version
138
152
  version: '0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: mongo
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ! '>='
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ! '>='
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: mongoid
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ! '>='
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ type: :runtime
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ! '>='
179
+ - !ruby/object:Gem::Version
180
+ version: '0'
139
181
  - !ruby/object:Gem::Dependency
140
182
  name: mime-types
141
183
  requirement: !ruby/object:Gem::Requirement
@@ -220,6 +262,48 @@ dependencies:
220
262
  - - ! '>='
221
263
  - !ruby/object:Gem::Version
222
264
  version: '0'
265
+ - !ruby/object:Gem::Dependency
266
+ name: guard
267
+ requirement: !ruby/object:Gem::Requirement
268
+ requirements:
269
+ - - ! '>='
270
+ - !ruby/object:Gem::Version
271
+ version: '0'
272
+ type: :development
273
+ prerelease: false
274
+ version_requirements: !ruby/object:Gem::Requirement
275
+ requirements:
276
+ - - ! '>='
277
+ - !ruby/object:Gem::Version
278
+ version: '0'
279
+ - !ruby/object:Gem::Dependency
280
+ name: guard-rake
281
+ requirement: !ruby/object:Gem::Requirement
282
+ requirements:
283
+ - - ! '>='
284
+ - !ruby/object:Gem::Version
285
+ version: '0'
286
+ type: :development
287
+ prerelease: false
288
+ version_requirements: !ruby/object:Gem::Requirement
289
+ requirements:
290
+ - - ! '>='
291
+ - !ruby/object:Gem::Version
292
+ version: '0'
293
+ - !ruby/object:Gem::Dependency
294
+ name: guard-rspec
295
+ requirement: !ruby/object:Gem::Requirement
296
+ requirements:
297
+ - - ! '>='
298
+ - !ruby/object:Gem::Version
299
+ version: '0'
300
+ type: :development
301
+ prerelease: false
302
+ version_requirements: !ruby/object:Gem::Requirement
303
+ requirements:
304
+ - - ! '>='
305
+ - !ruby/object:Gem::Version
306
+ version: '0'
223
307
  description: Gem for crawling data from external sources
224
308
  email: korczis@gmail.com
225
309
  executables:
@@ -240,6 +324,7 @@ files:
240
324
  - ./lib/apollo_crawler/cache/null_cache.rb
241
325
  - ./lib/apollo_crawler/cache/memory_cache.rb
242
326
  - ./lib/apollo_crawler/cache/base_cache.rb
327
+ - ./lib/apollo_crawler/cache/mongo_cache.rb
243
328
  - ./lib/apollo_crawler/cache/memcached_cache.rb
244
329
  - ./lib/apollo_crawler/crawler/xkcd_crawler.rb
245
330
  - ./lib/apollo_crawler/crawler/google_crawler.rb