apollo-crawler 0.1.14 → 0.1.15
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/apollo-crawler +3 -0
- data/lib/apollo_crawler.rb +12 -26
- data/lib/apollo_crawler/cache/base_cache.rb +3 -13
- data/lib/apollo_crawler/cache/caches.rb +8 -0
- data/lib/apollo_crawler/cache/memcached_cache.rb +6 -17
- data/lib/apollo_crawler/cache/memory_cache.rb +6 -17
- data/lib/apollo_crawler/cache/mongo_cache.rb +7 -4
- data/lib/apollo_crawler/cache/null_cache.rb +8 -12
- data/lib/apollo_crawler/cache/sqlite_cache.rb +53 -0
- data/lib/apollo_crawler/crawler/base_crawler.rb +3 -4
- data/lib/apollo_crawler/crawler/crawlers.rb +7 -0
- data/lib/apollo_crawler/fetcher/base_fetcher.rb +2 -1
- data/lib/apollo_crawler/fetcher/fetchers.rb +3 -0
- data/lib/apollo_crawler/formatter/formatters.rb +4 -0
- data/lib/apollo_crawler/helper/core_helper.rb +29 -0
- data/lib/apollo_crawler/helper/helpers.rb +1 -0
- data/lib/apollo_crawler/lib.rb +12 -27
- data/lib/apollo_crawler/logger/loggers.rb +2 -0
- data/lib/apollo_crawler/program/base_program.rb +29 -0
- data/lib/apollo_crawler/{program.rb → program/crawler_program.rb} +40 -38
- data/lib/apollo_crawler/program/programs.rb +2 -0
- data/lib/apollo_crawler/store/stores.rb +1 -0
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +123 -78
- checksums.yaml +0 -15
data/bin/apollo-crawler
CHANGED
@@ -21,6 +21,9 @@
|
|
21
21
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
22
|
# THE SOFTWARE.
|
23
23
|
|
24
|
+
require "rubygems"
|
25
|
+
require "bundler/setup"
|
26
|
+
|
24
27
|
require File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler")
|
25
28
|
|
26
29
|
Apollo::CrawlerProgram.new.run(ARGV)
|
data/lib/apollo_crawler.rb
CHANGED
@@ -21,40 +21,26 @@
|
|
21
21
|
# TODO: Make this work - DRY!
|
22
22
|
# require File.join(File.dirname(__FILE__), 'apollo_crawler/lib')
|
23
23
|
|
24
|
-
# Main
|
25
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/program')
|
26
|
-
|
27
24
|
# Caches
|
28
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/
|
29
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/factory')
|
30
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memcached_cache')
|
31
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memory_cache')
|
32
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/mongo_cache')
|
33
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/null_cache')
|
25
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/caches')
|
34
26
|
|
35
27
|
# Crawlers
|
36
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
37
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/google_crawler')
|
38
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/hacker_news_crawler')
|
39
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_crawler')
|
40
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_crawler')
|
41
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_crawler')
|
42
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/youjizz_crawler')
|
28
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/crawlers')
|
43
29
|
|
44
30
|
# Fetchers
|
45
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/
|
46
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/simple_fetcher')
|
47
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/smart_fetcher')
|
31
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/fetchers')
|
48
32
|
|
49
33
|
# Formatters
|
50
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/
|
51
|
-
|
52
|
-
|
53
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/
|
34
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatters')
|
35
|
+
|
36
|
+
# Helpers
|
37
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/helper/helpers')
|
54
38
|
|
55
39
|
# Loggers
|
56
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/
|
57
|
-
|
40
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/loggers')
|
41
|
+
|
42
|
+
# Program
|
43
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/program/programs')
|
58
44
|
|
59
45
|
# Stores
|
60
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/store/
|
46
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/store/stores')
|
@@ -25,7 +25,7 @@ module Apollo
|
|
25
25
|
end
|
26
26
|
|
27
27
|
# Get value associated with key from cache
|
28
|
-
def
|
28
|
+
def try_get(key, *args)
|
29
29
|
|
30
30
|
# Not found, Create, cache and return
|
31
31
|
if block_given?
|
@@ -41,20 +41,10 @@ module Apollo
|
|
41
41
|
return value
|
42
42
|
end
|
43
43
|
|
44
|
-
|
45
|
-
|
46
|
-
return false
|
44
|
+
def remove(key)
|
45
|
+
# self.set(key, nil)
|
47
46
|
end
|
48
47
|
|
49
|
-
# Invalidate key/value pair
|
50
|
-
def invalidate(key)
|
51
|
-
return true
|
52
|
-
end
|
53
|
-
|
54
|
-
# Clear cache
|
55
|
-
def clear
|
56
|
-
return
|
57
|
-
end
|
58
48
|
end # class BaseCache
|
59
49
|
end # module Cache
|
60
50
|
end # module Apollo
|
@@ -0,0 +1,8 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_cache')
|
2
|
+
require File.join(File.dirname(__FILE__), 'memcached_cache')
|
3
|
+
require File.join(File.dirname(__FILE__), 'memory_cache')
|
4
|
+
require File.join(File.dirname(__FILE__), 'mongo_cache')
|
5
|
+
require File.join(File.dirname(__FILE__), 'null_cache')
|
6
|
+
require File.join(File.dirname(__FILE__), 'sqlite_cache')
|
7
|
+
|
8
|
+
require File.join(File.dirname(__FILE__), 'factory')
|
@@ -32,9 +32,13 @@ module Apollo
|
|
32
32
|
@cache = Dalli::Client.new()
|
33
33
|
end
|
34
34
|
|
35
|
+
def get(key)
|
36
|
+
@cache.get(key)
|
37
|
+
end
|
38
|
+
|
35
39
|
# Get value associated with key from cache
|
36
|
-
def
|
37
|
-
res =
|
40
|
+
def try_get(key, *args)
|
41
|
+
res = get(key)
|
38
42
|
|
39
43
|
# Not found, Create, cache and return
|
40
44
|
if res.nil? && block_given?
|
@@ -52,21 +56,6 @@ module Apollo
|
|
52
56
|
@cache.set(key, value)
|
53
57
|
return key
|
54
58
|
end
|
55
|
-
|
56
|
-
# Check if cache contains specified key
|
57
|
-
def contains(key)
|
58
|
-
# TODO: Implement
|
59
|
-
end
|
60
|
-
|
61
|
-
# Invalidate key/value pair
|
62
|
-
def invalidate(key)
|
63
|
-
# TODO: Implement
|
64
|
-
end
|
65
|
-
|
66
|
-
# Clear cache
|
67
|
-
def clear
|
68
|
-
# TODO: Implement
|
69
|
-
end
|
70
59
|
end # class MemcachedCache
|
71
60
|
end # module Cache
|
72
61
|
end # module Apollo
|
@@ -29,9 +29,13 @@ module Apollo
|
|
29
29
|
@cache = {}
|
30
30
|
end
|
31
31
|
|
32
|
+
def get(key)
|
33
|
+
@cache[key]
|
34
|
+
end
|
35
|
+
|
32
36
|
# Get value associated with key from cache
|
33
|
-
def
|
34
|
-
res =
|
37
|
+
def try_get(key, *args)
|
38
|
+
res = get(key)
|
35
39
|
|
36
40
|
# Not found, Create, cache and return
|
37
41
|
if res.nil? && block_given?
|
@@ -46,21 +50,6 @@ module Apollo
|
|
46
50
|
def set(key, value)
|
47
51
|
@cache[key] = value
|
48
52
|
end
|
49
|
-
|
50
|
-
# Check if cache contains specified key
|
51
|
-
def contains(key)
|
52
|
-
@cache.has_key?(key)
|
53
|
-
end
|
54
|
-
|
55
|
-
# Invalidate key/value pair
|
56
|
-
def invalidate(key)
|
57
|
-
@cache.delete(key)
|
58
|
-
end
|
59
|
-
|
60
|
-
# Clear cache
|
61
|
-
def clear
|
62
|
-
@cache.clear
|
63
|
-
end
|
64
53
|
end # class MemoryCache
|
65
54
|
end # module Cache
|
66
55
|
end # module Apollo
|
@@ -38,16 +38,19 @@ module Apollo
|
|
38
38
|
super(options)
|
39
39
|
|
40
40
|
opts = @@DEFAULT_OPTIONS.merge(options)
|
41
|
-
|
42
|
-
|
41
|
+
|
43
42
|
@mongo_client = Mongo::MongoClient.new(opts[:host], opts[:port], :pool_size => opts[:pool_size], :pool_timeout => opts[:pool_timeout])
|
44
43
|
@db = @mongo_client[opts[:db]]
|
45
44
|
@coll = @db[opts[:collection]]
|
46
45
|
end
|
47
46
|
|
47
|
+
def get(key)
|
48
|
+
@coll.find({:url => key})
|
49
|
+
end
|
50
|
+
|
48
51
|
# Get value associated with key from cache
|
49
|
-
def
|
50
|
-
res =
|
52
|
+
def try_get(key, *args)
|
53
|
+
res = get(key)
|
51
54
|
|
52
55
|
# Not found, Create, cache and return
|
53
56
|
if res.nil? || res.count < 1 && block_given?
|
@@ -27,10 +27,16 @@ module Apollo
|
|
27
27
|
super(options)
|
28
28
|
end
|
29
29
|
|
30
|
+
def get(key)
|
31
|
+
nil
|
32
|
+
end
|
33
|
+
|
30
34
|
# Get value associated with key from cache
|
31
|
-
def
|
35
|
+
def try_get(key, *args)
|
36
|
+
res = get(key)
|
37
|
+
|
32
38
|
# Not found, Create, cache and return
|
33
|
-
if block_given?
|
39
|
+
if res.nil? && block_given?
|
34
40
|
res = yield args
|
35
41
|
end
|
36
42
|
|
@@ -42,16 +48,6 @@ module Apollo
|
|
42
48
|
def set(key, value)
|
43
49
|
return value
|
44
50
|
end
|
45
|
-
|
46
|
-
# Check if cache contains specified key
|
47
|
-
def contains(key)
|
48
|
-
return false
|
49
|
-
end
|
50
|
-
|
51
|
-
# Invalidate key/value pair
|
52
|
-
def invalidate(key)
|
53
|
-
return true
|
54
|
-
end
|
55
51
|
end # NullCache
|
56
52
|
end # Cache
|
57
53
|
end # Apollo
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_cache')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Cache
|
25
|
+
class SqliteCache < BaseCache
|
26
|
+
def initilize(options = {})
|
27
|
+
super(options)
|
28
|
+
end
|
29
|
+
|
30
|
+
def get(key)
|
31
|
+
return nil
|
32
|
+
end
|
33
|
+
|
34
|
+
# Get value associated with key from cache
|
35
|
+
def try_get(key, *args)
|
36
|
+
res = get(key)
|
37
|
+
|
38
|
+
# Not found, Create, cache and return
|
39
|
+
if res.nil? && block_given?
|
40
|
+
res = yield args
|
41
|
+
end
|
42
|
+
|
43
|
+
return res
|
44
|
+
end
|
45
|
+
|
46
|
+
# Set value associated with key
|
47
|
+
# Return cached value
|
48
|
+
def set(key, value)
|
49
|
+
return value
|
50
|
+
end
|
51
|
+
end # SqliteCache
|
52
|
+
end # Cache
|
53
|
+
end # Apollo
|
@@ -112,7 +112,7 @@ module Apollo
|
|
112
112
|
# Add document to queue of results
|
113
113
|
res << doc
|
114
114
|
|
115
|
-
enqueue_url(doc[:links]) if doc[:links]
|
115
|
+
enqueue_url(doc[:links].map(){ |l| l[:link] }) if doc[:links]
|
116
116
|
end
|
117
117
|
|
118
118
|
# Break if the limit of documents to be processed was reached
|
@@ -130,10 +130,9 @@ module Apollo
|
|
130
130
|
def enqueue_url(url)
|
131
131
|
urls = []
|
132
132
|
return urls if url.nil?
|
133
|
-
|
134
133
|
# We support both - list of urls or single url
|
135
134
|
if(url.kind_of?(Array))
|
136
|
-
urls.concat(url)
|
135
|
+
urls = urls.concat(url)
|
137
136
|
else
|
138
137
|
urls << url
|
139
138
|
end
|
@@ -194,7 +193,7 @@ module Apollo
|
|
194
193
|
|
195
194
|
# TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
|
196
195
|
cache = Apollo::Cache::Factory.instance.construct
|
197
|
-
metadoc = cache.
|
196
|
+
metadoc = cache.try_get(url) do
|
198
197
|
max_attempts = 3
|
199
198
|
attempt_no = 0
|
200
199
|
success = false
|
@@ -0,0 +1,7 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
|
+
require File.join(File.dirname(__FILE__), 'google_crawler')
|
3
|
+
require File.join(File.dirname(__FILE__), 'hacker_news_crawler')
|
4
|
+
require File.join(File.dirname(__FILE__), 'slashdot_crawler')
|
5
|
+
require File.join(File.dirname(__FILE__), 'stackoverflow_crawler')
|
6
|
+
require File.join(File.dirname(__FILE__), 'xkcd_crawler')
|
7
|
+
require File.join(File.dirname(__FILE__), 'youjizz_crawler')
|
@@ -21,11 +21,12 @@
|
|
21
21
|
require "open-uri"
|
22
22
|
require "nokogiri"
|
23
23
|
|
24
|
+
require "em-http-request"
|
25
|
+
|
24
26
|
module Apollo
|
25
27
|
module Fetcher
|
26
28
|
class BaseFetcher
|
27
29
|
def self.fetch(url)
|
28
|
-
# TODO: Throw exception ???
|
29
30
|
return open(url).read
|
30
31
|
end
|
31
32
|
end # class BaseFetcher
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
module Apollo
|
22
|
+
module Helper
|
23
|
+
class CoreHelper
|
24
|
+
def self.name_re()
|
25
|
+
return /formatter$/
|
26
|
+
end
|
27
|
+
end # class CoreHelper
|
28
|
+
end # module Helper
|
29
|
+
end # module Apollo
|
@@ -0,0 +1 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'core_helper')
|
data/lib/apollo_crawler/lib.rb
CHANGED
@@ -18,40 +18,25 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
-
# Main
|
22
|
-
require File.join(File.dirname(__FILE__), 'program')
|
23
|
-
|
24
21
|
# Caches
|
25
|
-
require File.join(File.dirname(__FILE__), 'cache/
|
26
|
-
require File.join(File.dirname(__FILE__), 'cache/factory')
|
27
|
-
require File.join(File.dirname(__FILE__), 'cache/memcached_cache')
|
28
|
-
require File.join(File.dirname(__FILE__), 'cache/memory_cache')
|
29
|
-
require File.join(File.dirname(__FILE__), 'cache/mongo_cache')
|
30
|
-
require File.join(File.dirname(__FILE__), 'cache/null_cache')
|
22
|
+
require File.join(File.dirname(__FILE__), 'cache/caches')
|
31
23
|
|
32
24
|
# Crawlers
|
33
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
34
|
-
require File.join(File.dirname(__FILE__), 'crawler/google_crawler')
|
35
|
-
require File.join(File.dirname(__FILE__), 'crawler/hacker_news_crawler')
|
36
|
-
require File.join(File.dirname(__FILE__), 'crawler/slashdot_crawler')
|
37
|
-
require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_crawler')
|
38
|
-
require File.join(File.dirname(__FILE__), 'crawler/xkcd_crawler')
|
39
|
-
require File.join(File.dirname(__FILE__), 'crawler/youjizz_crawler')
|
25
|
+
require File.join(File.dirname(__FILE__), 'crawler/crawlers')
|
40
26
|
|
41
27
|
# Fetchers
|
42
|
-
require File.join(File.dirname(__FILE__), 'fetcher/
|
43
|
-
require File.join(File.dirname(__FILE__), 'fetcher/simple_fetcher')
|
44
|
-
require File.join(File.dirname(__FILE__), 'fetcher/smart_fetcher')
|
45
|
-
|
28
|
+
require File.join(File.dirname(__FILE__), 'fetcher/fetchers')
|
46
29
|
# Formatters
|
47
|
-
require File.join(File.dirname(__FILE__), 'formatter/
|
48
|
-
|
49
|
-
|
50
|
-
require File.join(File.dirname(__FILE__), '
|
30
|
+
require File.join(File.dirname(__FILE__), 'formatter/formatters')
|
31
|
+
|
32
|
+
# Helpers
|
33
|
+
require File.join(File.dirname(__FILE__), 'helper/helpers')
|
51
34
|
|
52
35
|
# Loggers
|
53
|
-
require File.join(File.dirname(__FILE__), 'logger/
|
54
|
-
|
36
|
+
require File.join(File.dirname(__FILE__), 'logger/loggers')
|
37
|
+
|
38
|
+
# Programs
|
39
|
+
require File.join(File.dirname(__FILE__), 'program/programs')
|
55
40
|
|
56
41
|
# Stores
|
57
|
-
require File.join(File.dirname(__FILE__), 'store/
|
42
|
+
require File.join(File.dirname(__FILE__), 'store/stores')
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
module Apollo
|
22
|
+
class BaseProgram
|
23
|
+
def self.require_files(files = [])
|
24
|
+
Dir.glob(files).each do |file|
|
25
|
+
require file
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end # class BaseProgram
|
29
|
+
end # module Apollo
|
@@ -18,9 +18,6 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
-
require "rubygems"
|
22
|
-
require "bundler/setup"
|
23
|
-
|
24
21
|
require 'json'
|
25
22
|
|
26
23
|
require "thor"
|
@@ -39,15 +36,17 @@ require 'terminal-table'
|
|
39
36
|
require 'eventmachine'
|
40
37
|
require 'em-http'
|
41
38
|
|
42
|
-
require File.join(File.dirname(__FILE__), 'version')
|
39
|
+
require File.join(File.dirname(__FILE__), '..', 'version')
|
43
40
|
|
44
41
|
# require File.join(File.dirname(__FILE__), 'config/crawler')
|
45
42
|
# puts Apollo::CrawlerProgramConfig
|
46
43
|
|
44
|
+
require File.join(File.dirname(__FILE__),'base_program')
|
45
|
+
|
47
46
|
module Apollo
|
48
|
-
class CrawlerProgram
|
47
|
+
class CrawlerProgram < BaseProgram
|
49
48
|
# Load default config
|
50
|
-
require File.join(File.dirname(__FILE__), "config")
|
49
|
+
require File.join(File.dirname(__FILE__), "..", "config")
|
51
50
|
|
52
51
|
# This hash will hold all of the options
|
53
52
|
# parsed from the command-line by OptionParser.
|
@@ -235,10 +234,7 @@ module Apollo
|
|
235
234
|
puts "Registering caches - '#{dir}'"
|
236
235
|
end
|
237
236
|
|
238
|
-
|
239
|
-
Dir.glob(files).each do |file|
|
240
|
-
require file
|
241
|
-
end
|
237
|
+
BaseProgram.require_files(File.join(dir, "**", "*.rb"))
|
242
238
|
|
243
239
|
tmp = Apollo::Cache.constants.select { |c|
|
244
240
|
Class === Apollo::Cache.const_get(c)
|
@@ -269,11 +265,8 @@ module Apollo
|
|
269
265
|
puts "Registering crawlers - '#{dir}'"
|
270
266
|
end
|
271
267
|
|
272
|
-
|
273
|
-
|
274
|
-
require file
|
275
|
-
end
|
276
|
-
|
268
|
+
BaseProgram.require_files(File.join(dir, "**", "*.rb"))
|
269
|
+
|
277
270
|
tmp = Apollo::Crawler.constants.select { |c|
|
278
271
|
Class === Apollo::Crawler.const_get(c)
|
279
272
|
}
|
@@ -303,10 +296,7 @@ module Apollo
|
|
303
296
|
puts "Registering formatters - '#{dir}'"
|
304
297
|
end
|
305
298
|
|
306
|
-
|
307
|
-
Dir.glob(files).each do |file|
|
308
|
-
require file
|
309
|
-
end
|
299
|
+
BaseProgram.require_files(File.join(dir, "**", "*.rb"))
|
310
300
|
|
311
301
|
tmp = Apollo::Formatter.constants.select { |c|
|
312
302
|
Class === Apollo::Formatter.const_get(c)
|
@@ -348,7 +338,7 @@ module Apollo
|
|
348
338
|
end
|
349
339
|
end
|
350
340
|
|
351
|
-
def generate_crawler(name, url = nil, matcher = nil)
|
341
|
+
def generate_crawler(name, url = nil, matcher = nil, options = @options)
|
352
342
|
name = name.titleize.gsub(" ", "")
|
353
343
|
|
354
344
|
if(@options[:verbose])
|
@@ -362,11 +352,13 @@ module Apollo
|
|
362
352
|
return -1
|
363
353
|
end
|
364
354
|
|
365
|
-
if(
|
355
|
+
if(options[:verbose])
|
366
356
|
puts "Using template '#{template_path}'"
|
367
357
|
end
|
368
358
|
|
369
|
-
|
359
|
+
unless(options[:silent])
|
360
|
+
dest_path = File.join(Dir.pwd, "#{name.underscore}.rb")
|
361
|
+
end
|
370
362
|
|
371
363
|
url = url ? url : "http://some-url-here"
|
372
364
|
matcher = matcher ? matcher : "//a"
|
@@ -414,23 +406,31 @@ module Apollo
|
|
414
406
|
return
|
415
407
|
end
|
416
408
|
|
417
|
-
def
|
409
|
+
def get_crawlers_by_name(crawlers, crawler_classes = @crawlers)
|
410
|
+
res = []
|
411
|
+
|
418
412
|
crawlers.each do |name|
|
419
413
|
crawler_name = name.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
|
420
414
|
|
421
|
-
crawler =
|
415
|
+
crawler = crawler_classes[crawler_name]
|
422
416
|
if(crawler == nil)
|
423
|
-
|
424
|
-
puts "See program help"
|
425
|
-
return 0
|
417
|
+
next
|
426
418
|
end
|
427
419
|
|
428
|
-
|
420
|
+
res << crawler
|
421
|
+
end
|
422
|
+
|
423
|
+
return res
|
424
|
+
end
|
425
|
+
|
426
|
+
def run_crawlers(crawlers, args, options = @options)
|
427
|
+
crawlers.each do |crawler|
|
428
|
+
if(options[:verbose])
|
429
429
|
puts "Running '#{crawler}'"
|
430
430
|
end
|
431
431
|
|
432
432
|
opts = {
|
433
|
-
:doc_limit =>
|
433
|
+
:doc_limit => options[:doc_limit]
|
434
434
|
}
|
435
435
|
|
436
436
|
res = crawler.new.etl(args, opts) { | docs |
|
@@ -442,23 +442,23 @@ module Apollo
|
|
442
442
|
end
|
443
443
|
|
444
444
|
# Get crawlers passed on the command line
|
445
|
-
def get_crawlers(args,
|
445
|
+
def get_crawlers(args, options = @options)
|
446
446
|
crawlers = []
|
447
447
|
if(args.length > 0)
|
448
448
|
crawlers << args.shift
|
449
449
|
end
|
450
450
|
|
451
|
-
if(
|
451
|
+
if(options[:run_all])
|
452
452
|
crawlers = @crawlers.keys
|
453
453
|
end
|
454
454
|
|
455
455
|
return crawlers
|
456
456
|
end
|
457
457
|
|
458
|
-
def init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES,
|
458
|
+
def init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, options = @options)
|
459
459
|
dirs.each do |dir|
|
460
460
|
if(File.directory?(dir) == false)
|
461
|
-
if(
|
461
|
+
if(options[:verbose])
|
462
462
|
puts "Creating '#{dir}'"
|
463
463
|
end
|
464
464
|
|
@@ -469,10 +469,10 @@ module Apollo
|
|
469
469
|
init_user_config_file(File.join(File.dirname(__FILE__), 'config_user.trb'), File.join(base_dir, 'config.rb'))
|
470
470
|
end
|
471
471
|
|
472
|
-
def init_user_config_file(config_path, dest_path,
|
472
|
+
def init_user_config_file(config_path, dest_path, options = @options)
|
473
473
|
# Create user config file
|
474
474
|
if(File.exists?(config_path) && File.exists?(dest_path) == false)
|
475
|
-
if(
|
475
|
+
if(options[:verbose])
|
476
476
|
puts "Creating user config file '#{config_path}' => '#{dest_path}'"
|
477
477
|
end
|
478
478
|
|
@@ -511,13 +511,15 @@ module Apollo
|
|
511
511
|
return request_exit(res_code)
|
512
512
|
end
|
513
513
|
|
514
|
-
|
515
|
-
if(
|
514
|
+
crawler_names = get_crawlers(args)
|
515
|
+
if(crawler_names.empty?)
|
516
516
|
puts @optparser
|
517
517
|
return request_exit(0)
|
518
518
|
end
|
519
519
|
|
520
|
-
|
520
|
+
crawlers = get_crawlers_by_name(crawler_names, @crawlers)
|
521
|
+
|
522
|
+
res_code = run_crawlers(crawlers, args, @options)
|
521
523
|
return request_exit(res_code)
|
522
524
|
end
|
523
525
|
|
@@ -0,0 +1 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_store')
|
metadata
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.15
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- Tomas Korcak
|
@@ -10,51 +11,42 @@ bindir: bin
|
|
10
11
|
cert_chain: []
|
11
12
|
date: 2013-03-03 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: amqp
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ! '>='
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ! '>='
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
14
|
- !ruby/object:Gem::Dependency
|
28
15
|
name: awesome_print
|
29
16
|
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
30
18
|
requirements:
|
31
|
-
- -
|
19
|
+
- - ~>
|
32
20
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
21
|
+
version: 1.1.0
|
34
22
|
type: :runtime
|
35
23
|
prerelease: false
|
36
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
37
26
|
requirements:
|
38
|
-
- -
|
27
|
+
- - ~>
|
39
28
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
29
|
+
version: 1.1.0
|
41
30
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
31
|
+
name: activesupport
|
43
32
|
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
44
34
|
requirements:
|
45
35
|
- - ! '>='
|
46
36
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
37
|
+
version: 3.2.12
|
48
38
|
type: :runtime
|
49
39
|
prerelease: false
|
50
40
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
51
42
|
requirements:
|
52
43
|
- - ! '>='
|
53
44
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
45
|
+
version: 3.2.12
|
55
46
|
- !ruby/object:Gem::Dependency
|
56
47
|
name: dalli
|
57
48
|
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
58
50
|
requirements:
|
59
51
|
- - ! '>='
|
60
52
|
- !ruby/object:Gem::Version
|
@@ -62,6 +54,7 @@ dependencies:
|
|
62
54
|
type: :runtime
|
63
55
|
prerelease: false
|
64
56
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
65
58
|
requirements:
|
66
59
|
- - ! '>='
|
67
60
|
- !ruby/object:Gem::Version
|
@@ -69,6 +62,7 @@ dependencies:
|
|
69
62
|
- !ruby/object:Gem::Dependency
|
70
63
|
name: fastercsv
|
71
64
|
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
72
66
|
requirements:
|
73
67
|
- - ~>
|
74
68
|
- !ruby/object:Gem::Version
|
@@ -76,13 +70,15 @@ dependencies:
|
|
76
70
|
type: :runtime
|
77
71
|
prerelease: false
|
78
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
79
74
|
requirements:
|
80
75
|
- - ~>
|
81
76
|
- !ruby/object:Gem::Version
|
82
77
|
version: 1.5.5
|
83
78
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
79
|
+
name: eventmachine
|
85
80
|
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
86
82
|
requirements:
|
87
83
|
- - ! '>='
|
88
84
|
- !ruby/object:Gem::Version
|
@@ -90,6 +86,7 @@ dependencies:
|
|
90
86
|
type: :runtime
|
91
87
|
prerelease: false
|
92
88
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
93
90
|
requirements:
|
94
91
|
- - ! '>='
|
95
92
|
- !ruby/object:Gem::Version
|
@@ -97,6 +94,7 @@ dependencies:
|
|
97
94
|
- !ruby/object:Gem::Dependency
|
98
95
|
name: em-http-request
|
99
96
|
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
100
98
|
requirements:
|
101
99
|
- - ! '>='
|
102
100
|
- !ruby/object:Gem::Version
|
@@ -104,6 +102,7 @@ dependencies:
|
|
104
102
|
type: :runtime
|
105
103
|
prerelease: false
|
106
104
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
107
106
|
requirements:
|
108
107
|
- - ! '>='
|
109
108
|
- !ruby/object:Gem::Version
|
@@ -111,6 +110,7 @@ dependencies:
|
|
111
110
|
- !ruby/object:Gem::Dependency
|
112
111
|
name: em-synchrony
|
113
112
|
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
114
|
requirements:
|
115
115
|
- - ! '>='
|
116
116
|
- !ruby/object:Gem::Version
|
@@ -118,41 +118,47 @@ dependencies:
|
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
121
122
|
requirements:
|
122
123
|
- - ! '>='
|
123
124
|
- !ruby/object:Gem::Version
|
124
125
|
version: '0'
|
125
126
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
127
|
+
name: amqp
|
127
128
|
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
128
130
|
requirements:
|
129
|
-
- -
|
131
|
+
- - ~>
|
130
132
|
- !ruby/object:Gem::Version
|
131
|
-
version:
|
133
|
+
version: 0.9.9
|
132
134
|
type: :runtime
|
133
135
|
prerelease: false
|
134
136
|
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
135
138
|
requirements:
|
136
|
-
- -
|
139
|
+
- - ~>
|
137
140
|
- !ruby/object:Gem::Version
|
138
|
-
version:
|
141
|
+
version: 0.9.9
|
139
142
|
- !ruby/object:Gem::Dependency
|
140
|
-
name:
|
143
|
+
name: json
|
141
144
|
requirement: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
142
146
|
requirements:
|
143
|
-
- -
|
147
|
+
- - ~>
|
144
148
|
- !ruby/object:Gem::Version
|
145
|
-
version:
|
149
|
+
version: 1.7.1
|
146
150
|
type: :runtime
|
147
151
|
prerelease: false
|
148
152
|
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
149
154
|
requirements:
|
150
|
-
- -
|
155
|
+
- - ~>
|
151
156
|
- !ruby/object:Gem::Version
|
152
|
-
version:
|
157
|
+
version: 1.7.1
|
153
158
|
- !ruby/object:Gem::Dependency
|
154
|
-
name:
|
159
|
+
name: memcache-client
|
155
160
|
requirement: !ruby/object:Gem::Requirement
|
161
|
+
none: false
|
156
162
|
requirements:
|
157
163
|
- - ! '>='
|
158
164
|
- !ruby/object:Gem::Version
|
@@ -160,13 +166,15 @@ dependencies:
|
|
160
166
|
type: :runtime
|
161
167
|
prerelease: false
|
162
168
|
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
163
170
|
requirements:
|
164
171
|
- - ! '>='
|
165
172
|
- !ruby/object:Gem::Version
|
166
173
|
version: '0'
|
167
174
|
- !ruby/object:Gem::Dependency
|
168
|
-
name:
|
175
|
+
name: mongo
|
169
176
|
requirement: !ruby/object:Gem::Requirement
|
177
|
+
none: false
|
170
178
|
requirements:
|
171
179
|
- - ! '>='
|
172
180
|
- !ruby/object:Gem::Version
|
@@ -174,13 +182,15 @@ dependencies:
|
|
174
182
|
type: :runtime
|
175
183
|
prerelease: false
|
176
184
|
version_requirements: !ruby/object:Gem::Requirement
|
185
|
+
none: false
|
177
186
|
requirements:
|
178
187
|
- - ! '>='
|
179
188
|
- !ruby/object:Gem::Version
|
180
189
|
version: '0'
|
181
190
|
- !ruby/object:Gem::Dependency
|
182
|
-
name:
|
191
|
+
name: mongoid
|
183
192
|
requirement: !ruby/object:Gem::Requirement
|
193
|
+
none: false
|
184
194
|
requirements:
|
185
195
|
- - ! '>='
|
186
196
|
- !ruby/object:Gem::Version
|
@@ -188,13 +198,15 @@ dependencies:
|
|
188
198
|
type: :runtime
|
189
199
|
prerelease: false
|
190
200
|
version_requirements: !ruby/object:Gem::Requirement
|
201
|
+
none: false
|
191
202
|
requirements:
|
192
203
|
- - ! '>='
|
193
204
|
- !ruby/object:Gem::Version
|
194
205
|
version: '0'
|
195
206
|
- !ruby/object:Gem::Dependency
|
196
|
-
name:
|
207
|
+
name: mime-types
|
197
208
|
requirement: !ruby/object:Gem::Requirement
|
209
|
+
none: false
|
198
210
|
requirements:
|
199
211
|
- - ! '>='
|
200
212
|
- !ruby/object:Gem::Version
|
@@ -202,27 +214,31 @@ dependencies:
|
|
202
214
|
type: :runtime
|
203
215
|
prerelease: false
|
204
216
|
version_requirements: !ruby/object:Gem::Requirement
|
217
|
+
none: false
|
205
218
|
requirements:
|
206
219
|
- - ! '>='
|
207
220
|
- !ruby/object:Gem::Version
|
208
221
|
version: '0'
|
209
222
|
- !ruby/object:Gem::Dependency
|
210
|
-
name:
|
223
|
+
name: nokogiri
|
211
224
|
requirement: !ruby/object:Gem::Requirement
|
225
|
+
none: false
|
212
226
|
requirements:
|
213
|
-
- -
|
227
|
+
- - ~>
|
214
228
|
- !ruby/object:Gem::Version
|
215
|
-
version:
|
229
|
+
version: 1.5.6
|
216
230
|
type: :runtime
|
217
231
|
prerelease: false
|
218
232
|
version_requirements: !ruby/object:Gem::Requirement
|
233
|
+
none: false
|
219
234
|
requirements:
|
220
|
-
- -
|
235
|
+
- - ~>
|
221
236
|
- !ruby/object:Gem::Version
|
222
|
-
version:
|
237
|
+
version: 1.5.6
|
223
238
|
- !ruby/object:Gem::Dependency
|
224
|
-
name:
|
239
|
+
name: openurl
|
225
240
|
requirement: !ruby/object:Gem::Requirement
|
241
|
+
none: false
|
226
242
|
requirements:
|
227
243
|
- - ! '>='
|
228
244
|
- !ruby/object:Gem::Version
|
@@ -230,13 +246,15 @@ dependencies:
|
|
230
246
|
type: :runtime
|
231
247
|
prerelease: false
|
232
248
|
version_requirements: !ruby/object:Gem::Requirement
|
249
|
+
none: false
|
233
250
|
requirements:
|
234
251
|
- - ! '>='
|
235
252
|
- !ruby/object:Gem::Version
|
236
253
|
version: '0'
|
237
254
|
- !ruby/object:Gem::Dependency
|
238
|
-
name:
|
255
|
+
name: parallel
|
239
256
|
requirement: !ruby/object:Gem::Requirement
|
257
|
+
none: false
|
240
258
|
requirements:
|
241
259
|
- - ! '>='
|
242
260
|
- !ruby/object:Gem::Version
|
@@ -244,69 +262,79 @@ dependencies:
|
|
244
262
|
type: :runtime
|
245
263
|
prerelease: false
|
246
264
|
version_requirements: !ruby/object:Gem::Requirement
|
265
|
+
none: false
|
247
266
|
requirements:
|
248
267
|
- - ! '>='
|
249
268
|
- !ruby/object:Gem::Version
|
250
269
|
version: '0'
|
251
270
|
- !ruby/object:Gem::Dependency
|
252
|
-
name:
|
271
|
+
name: rack
|
253
272
|
requirement: !ruby/object:Gem::Requirement
|
273
|
+
none: false
|
254
274
|
requirements:
|
255
275
|
- - ! '>='
|
256
276
|
- !ruby/object:Gem::Version
|
257
|
-
version:
|
277
|
+
version: 1.5.2
|
258
278
|
type: :runtime
|
259
279
|
prerelease: false
|
260
280
|
version_requirements: !ruby/object:Gem::Requirement
|
281
|
+
none: false
|
261
282
|
requirements:
|
262
283
|
- - ! '>='
|
263
284
|
- !ruby/object:Gem::Version
|
264
|
-
version:
|
285
|
+
version: 1.5.2
|
265
286
|
- !ruby/object:Gem::Dependency
|
266
287
|
name: terminal-table
|
267
288
|
requirement: !ruby/object:Gem::Requirement
|
289
|
+
none: false
|
268
290
|
requirements:
|
269
|
-
- -
|
291
|
+
- - ~>
|
270
292
|
- !ruby/object:Gem::Version
|
271
|
-
version:
|
293
|
+
version: 1.4.5
|
272
294
|
type: :runtime
|
273
295
|
prerelease: false
|
274
296
|
version_requirements: !ruby/object:Gem::Requirement
|
297
|
+
none: false
|
275
298
|
requirements:
|
276
|
-
- -
|
299
|
+
- - ~>
|
277
300
|
- !ruby/object:Gem::Version
|
278
|
-
version:
|
301
|
+
version: 1.4.5
|
279
302
|
- !ruby/object:Gem::Dependency
|
280
303
|
name: thor
|
281
304
|
requirement: !ruby/object:Gem::Requirement
|
305
|
+
none: false
|
282
306
|
requirements:
|
283
|
-
- -
|
307
|
+
- - ~>
|
284
308
|
- !ruby/object:Gem::Version
|
285
|
-
version:
|
309
|
+
version: 0.17.0
|
286
310
|
type: :runtime
|
287
311
|
prerelease: false
|
288
312
|
version_requirements: !ruby/object:Gem::Requirement
|
313
|
+
none: false
|
289
314
|
requirements:
|
290
|
-
- -
|
315
|
+
- - ~>
|
291
316
|
- !ruby/object:Gem::Version
|
292
|
-
version:
|
317
|
+
version: 0.17.0
|
293
318
|
- !ruby/object:Gem::Dependency
|
294
|
-
name:
|
319
|
+
name: writeexcel
|
295
320
|
requirement: !ruby/object:Gem::Requirement
|
321
|
+
none: false
|
296
322
|
requirements:
|
297
|
-
- -
|
323
|
+
- - ~>
|
298
324
|
- !ruby/object:Gem::Version
|
299
|
-
version:
|
325
|
+
version: 0.6.18
|
300
326
|
type: :runtime
|
301
327
|
prerelease: false
|
302
328
|
version_requirements: !ruby/object:Gem::Requirement
|
329
|
+
none: false
|
303
330
|
requirements:
|
304
|
-
- -
|
331
|
+
- - ~>
|
305
332
|
- !ruby/object:Gem::Version
|
306
|
-
version:
|
333
|
+
version: 0.6.18
|
307
334
|
- !ruby/object:Gem::Dependency
|
308
335
|
name: guard
|
309
336
|
requirement: !ruby/object:Gem::Requirement
|
337
|
+
none: false
|
310
338
|
requirements:
|
311
339
|
- - ! '>='
|
312
340
|
- !ruby/object:Gem::Version
|
@@ -314,6 +342,7 @@ dependencies:
|
|
314
342
|
type: :development
|
315
343
|
prerelease: false
|
316
344
|
version_requirements: !ruby/object:Gem::Requirement
|
345
|
+
none: false
|
317
346
|
requirements:
|
318
347
|
- - ! '>='
|
319
348
|
- !ruby/object:Gem::Version
|
@@ -321,6 +350,7 @@ dependencies:
|
|
321
350
|
- !ruby/object:Gem::Dependency
|
322
351
|
name: guard-rake
|
323
352
|
requirement: !ruby/object:Gem::Requirement
|
353
|
+
none: false
|
324
354
|
requirements:
|
325
355
|
- - ! '>='
|
326
356
|
- !ruby/object:Gem::Version
|
@@ -328,6 +358,7 @@ dependencies:
|
|
328
358
|
type: :development
|
329
359
|
prerelease: false
|
330
360
|
version_requirements: !ruby/object:Gem::Requirement
|
361
|
+
none: false
|
331
362
|
requirements:
|
332
363
|
- - ! '>='
|
333
364
|
- !ruby/object:Gem::Version
|
@@ -335,6 +366,7 @@ dependencies:
|
|
335
366
|
- !ruby/object:Gem::Dependency
|
336
367
|
name: guard-rspec
|
337
368
|
requirement: !ruby/object:Gem::Requirement
|
369
|
+
none: false
|
338
370
|
requirements:
|
339
371
|
- - ! '>='
|
340
372
|
- !ruby/object:Gem::Version
|
@@ -342,6 +374,7 @@ dependencies:
|
|
342
374
|
type: :development
|
343
375
|
prerelease: false
|
344
376
|
version_requirements: !ruby/object:Gem::Requirement
|
377
|
+
none: false
|
345
378
|
requirements:
|
346
379
|
- - ! '>='
|
347
380
|
- !ruby/object:Gem::Version
|
@@ -353,57 +386,69 @@ executables:
|
|
353
386
|
extensions: []
|
354
387
|
extra_rdoc_files: []
|
355
388
|
files:
|
389
|
+
- ./lib/apollo_crawler/lib.rb
|
390
|
+
- ./lib/apollo_crawler/program/base_program.rb
|
391
|
+
- ./lib/apollo_crawler/program/programs.rb
|
392
|
+
- ./lib/apollo_crawler/program/crawler_program.rb
|
356
393
|
- ./lib/apollo_crawler/fetcher/smart_fetcher.rb
|
357
394
|
- ./lib/apollo_crawler/fetcher/simple_fetcher.rb
|
358
395
|
- ./lib/apollo_crawler/fetcher/base_fetcher.rb
|
359
|
-
- ./lib/apollo_crawler/
|
396
|
+
- ./lib/apollo_crawler/fetcher/fetchers.rb
|
360
397
|
- ./lib/apollo_crawler/version.rb
|
361
|
-
- ./lib/apollo_crawler/logger/console_logger.rb
|
362
|
-
- ./lib/apollo_crawler/logger/base_logger.rb
|
363
|
-
- ./lib/apollo_crawler/program.rb
|
364
|
-
- ./lib/apollo_crawler/config.rb
|
365
|
-
- ./lib/apollo_crawler/cache/factory.rb
|
366
|
-
- ./lib/apollo_crawler/cache/null_cache.rb
|
367
|
-
- ./lib/apollo_crawler/cache/memory_cache.rb
|
368
|
-
- ./lib/apollo_crawler/cache/base_cache.rb
|
369
|
-
- ./lib/apollo_crawler/cache/mongo_cache.rb
|
370
|
-
- ./lib/apollo_crawler/cache/memcached_cache.rb
|
371
|
-
- ./lib/apollo_crawler/crawler/xkcd_crawler.rb
|
372
|
-
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
373
|
-
- ./lib/apollo_crawler/crawler/youjizz_crawler.rb
|
374
398
|
- ./lib/apollo_crawler/crawler/slashdot_crawler.rb
|
399
|
+
- ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
|
375
400
|
- ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
|
401
|
+
- ./lib/apollo_crawler/crawler/youjizz_crawler.rb
|
402
|
+
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
376
403
|
- ./lib/apollo_crawler/crawler/base_crawler.rb
|
377
|
-
- ./lib/apollo_crawler/crawler/
|
404
|
+
- ./lib/apollo_crawler/crawler/xkcd_crawler.rb
|
405
|
+
- ./lib/apollo_crawler/crawler/crawlers.rb
|
406
|
+
- ./lib/apollo_crawler/logger/loggers.rb
|
407
|
+
- ./lib/apollo_crawler/logger/console_logger.rb
|
408
|
+
- ./lib/apollo_crawler/logger/base_logger.rb
|
409
|
+
- ./lib/apollo_crawler/config.rb
|
378
410
|
- ./lib/apollo_crawler/formatter/table_formatter.rb
|
379
411
|
- ./lib/apollo_crawler/formatter/base_formatter.rb
|
380
412
|
- ./lib/apollo_crawler/formatter/json_formatter.rb
|
381
413
|
- ./lib/apollo_crawler/formatter/plain_formatter.rb
|
414
|
+
- ./lib/apollo_crawler/formatter/formatters.rb
|
415
|
+
- ./lib/apollo_crawler/cache/memcached_cache.rb
|
416
|
+
- ./lib/apollo_crawler/cache/memory_cache.rb
|
417
|
+
- ./lib/apollo_crawler/cache/mongo_cache.rb
|
418
|
+
- ./lib/apollo_crawler/cache/null_cache.rb
|
419
|
+
- ./lib/apollo_crawler/cache/factory.rb
|
420
|
+
- ./lib/apollo_crawler/cache/caches.rb
|
421
|
+
- ./lib/apollo_crawler/cache/sqlite_cache.rb
|
422
|
+
- ./lib/apollo_crawler/cache/base_cache.rb
|
382
423
|
- ./lib/apollo_crawler/store/base_store.rb
|
424
|
+
- ./lib/apollo_crawler/store/stores.rb
|
425
|
+
- ./lib/apollo_crawler/helper/core_helper.rb
|
426
|
+
- ./lib/apollo_crawler/helper/helpers.rb
|
383
427
|
- ./lib/apollo_crawler.rb
|
384
428
|
- bin/apollo-crawler
|
385
429
|
homepage: https://github.com/korczis/apollo-crawler
|
386
430
|
licenses:
|
387
431
|
- MIT
|
388
|
-
metadata: {}
|
389
432
|
post_install_message: Thanks for installing Apollo Crawler!
|
390
433
|
rdoc_options: []
|
391
434
|
require_paths:
|
392
435
|
- lib
|
393
436
|
required_ruby_version: !ruby/object:Gem::Requirement
|
437
|
+
none: false
|
394
438
|
requirements:
|
395
439
|
- - ! '>='
|
396
440
|
- !ruby/object:Gem::Version
|
397
441
|
version: '0'
|
398
442
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
443
|
+
none: false
|
399
444
|
requirements:
|
400
445
|
- - ! '>='
|
401
446
|
- !ruby/object:Gem::Version
|
402
447
|
version: '0'
|
403
448
|
requirements: []
|
404
449
|
rubyforge_project:
|
405
|
-
rubygems_version:
|
450
|
+
rubygems_version: 1.8.23
|
406
451
|
signing_key:
|
407
|
-
specification_version:
|
452
|
+
specification_version: 3
|
408
453
|
summary: Apollo Platform Crawler
|
409
454
|
test_files: []
|
checksums.yaml
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
---
|
2
|
-
!binary "U0hBMQ==":
|
3
|
-
metadata.gz: !binary |-
|
4
|
-
NDZmNjQ3N2FkZmVkYjc5NjQ3NjJiZTMyZGFlYjY4ODA0ZDgwYWE2OA==
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
OTA2NmJhOTEyYmJiMTFiYTFmZWFjNzY3YmEzYjYyZTc0MzZlMDk1Mw==
|
7
|
-
!binary "U0hBNTEy":
|
8
|
-
metadata.gz: !binary |-
|
9
|
-
MDk1NDcwN2FhYjA0YTVjNTA4N2FhOWRjNzcyMTJlMDg3ZjlmNmM2NTQyMjY2
|
10
|
-
ZTZjNjk1YjQ2MmYxMGU2MzViNmJkYjU3OTFlMDk2MTEzMjE2MTdkMGU3NWQ4
|
11
|
-
ZDdlN2Y0YTc1MjQ4NmRiZDc2ZGExMTkwMmViODVkYjY2MmI1YjI=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
MmZlM2IxNjUyMDg3Yjk3YjE5ODQ3OWM4NWY2NzIwNzEwODQ0OWJlMGI3MmQ5
|
14
|
-
ZmJkY2E3NDljYTJiMjhmYzMxYzY2ZTZlMTJmYTAyYjA4NWMxYTdkYmU1ZGUz
|
15
|
-
YzAwMzlhODY0NGQyMzNlMTc5MjQ1OTI0NzEzZmY2NmExMjA0Zjg=
|