apollo-crawler 0.1.14 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/apollo-crawler +3 -0
- data/lib/apollo_crawler.rb +12 -26
- data/lib/apollo_crawler/cache/base_cache.rb +3 -13
- data/lib/apollo_crawler/cache/caches.rb +8 -0
- data/lib/apollo_crawler/cache/memcached_cache.rb +6 -17
- data/lib/apollo_crawler/cache/memory_cache.rb +6 -17
- data/lib/apollo_crawler/cache/mongo_cache.rb +7 -4
- data/lib/apollo_crawler/cache/null_cache.rb +8 -12
- data/lib/apollo_crawler/cache/sqlite_cache.rb +53 -0
- data/lib/apollo_crawler/crawler/base_crawler.rb +3 -4
- data/lib/apollo_crawler/crawler/crawlers.rb +7 -0
- data/lib/apollo_crawler/fetcher/base_fetcher.rb +2 -1
- data/lib/apollo_crawler/fetcher/fetchers.rb +3 -0
- data/lib/apollo_crawler/formatter/formatters.rb +4 -0
- data/lib/apollo_crawler/helper/core_helper.rb +29 -0
- data/lib/apollo_crawler/helper/helpers.rb +1 -0
- data/lib/apollo_crawler/lib.rb +12 -27
- data/lib/apollo_crawler/logger/loggers.rb +2 -0
- data/lib/apollo_crawler/program/base_program.rb +29 -0
- data/lib/apollo_crawler/{program.rb → program/crawler_program.rb} +40 -38
- data/lib/apollo_crawler/program/programs.rb +2 -0
- data/lib/apollo_crawler/store/stores.rb +1 -0
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +123 -78
- checksums.yaml +0 -15
data/bin/apollo-crawler
CHANGED
@@ -21,6 +21,9 @@
|
|
21
21
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
22
|
# THE SOFTWARE.
|
23
23
|
|
24
|
+
require "rubygems"
|
25
|
+
require "bundler/setup"
|
26
|
+
|
24
27
|
require File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler")
|
25
28
|
|
26
29
|
Apollo::CrawlerProgram.new.run(ARGV)
|
data/lib/apollo_crawler.rb
CHANGED
@@ -21,40 +21,26 @@
|
|
21
21
|
# TODO: Make this work - DRY!
|
22
22
|
# require File.join(File.dirname(__FILE__), 'apollo_crawler/lib')
|
23
23
|
|
24
|
-
# Main
|
25
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/program')
|
26
|
-
|
27
24
|
# Caches
|
28
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/
|
29
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/factory')
|
30
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memcached_cache')
|
31
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memory_cache')
|
32
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/mongo_cache')
|
33
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/null_cache')
|
25
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/caches')
|
34
26
|
|
35
27
|
# Crawlers
|
36
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
37
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/google_crawler')
|
38
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/hacker_news_crawler')
|
39
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_crawler')
|
40
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_crawler')
|
41
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_crawler')
|
42
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/youjizz_crawler')
|
28
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/crawlers')
|
43
29
|
|
44
30
|
# Fetchers
|
45
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/
|
46
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/simple_fetcher')
|
47
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/smart_fetcher')
|
31
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/fetchers')
|
48
32
|
|
49
33
|
# Formatters
|
50
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/
|
51
|
-
|
52
|
-
|
53
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/
|
34
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatters')
|
35
|
+
|
36
|
+
# Helpers
|
37
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/helper/helpers')
|
54
38
|
|
55
39
|
# Loggers
|
56
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/
|
57
|
-
|
40
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/loggers')
|
41
|
+
|
42
|
+
# Program
|
43
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/program/programs')
|
58
44
|
|
59
45
|
# Stores
|
60
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/store/
|
46
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/store/stores')
|
@@ -25,7 +25,7 @@ module Apollo
|
|
25
25
|
end
|
26
26
|
|
27
27
|
# Get value associated with key from cache
|
28
|
-
def
|
28
|
+
def try_get(key, *args)
|
29
29
|
|
30
30
|
# Not found, Create, cache and return
|
31
31
|
if block_given?
|
@@ -41,20 +41,10 @@ module Apollo
|
|
41
41
|
return value
|
42
42
|
end
|
43
43
|
|
44
|
-
|
45
|
-
|
46
|
-
return false
|
44
|
+
def remove(key)
|
45
|
+
# self.set(key, nil)
|
47
46
|
end
|
48
47
|
|
49
|
-
# Invalidate key/value pair
|
50
|
-
def invalidate(key)
|
51
|
-
return true
|
52
|
-
end
|
53
|
-
|
54
|
-
# Clear cache
|
55
|
-
def clear
|
56
|
-
return
|
57
|
-
end
|
58
48
|
end # class BaseCache
|
59
49
|
end # module Cache
|
60
50
|
end # module Apollo
|
@@ -0,0 +1,8 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_cache')
|
2
|
+
require File.join(File.dirname(__FILE__), 'memcached_cache')
|
3
|
+
require File.join(File.dirname(__FILE__), 'memory_cache')
|
4
|
+
require File.join(File.dirname(__FILE__), 'mongo_cache')
|
5
|
+
require File.join(File.dirname(__FILE__), 'null_cache')
|
6
|
+
require File.join(File.dirname(__FILE__), 'sqlite_cache')
|
7
|
+
|
8
|
+
require File.join(File.dirname(__FILE__), 'factory')
|
@@ -32,9 +32,13 @@ module Apollo
|
|
32
32
|
@cache = Dalli::Client.new()
|
33
33
|
end
|
34
34
|
|
35
|
+
def get(key)
|
36
|
+
@cache.get(key)
|
37
|
+
end
|
38
|
+
|
35
39
|
# Get value associated with key from cache
|
36
|
-
def
|
37
|
-
res =
|
40
|
+
def try_get(key, *args)
|
41
|
+
res = get(key)
|
38
42
|
|
39
43
|
# Not found, Create, cache and return
|
40
44
|
if res.nil? && block_given?
|
@@ -52,21 +56,6 @@ module Apollo
|
|
52
56
|
@cache.set(key, value)
|
53
57
|
return key
|
54
58
|
end
|
55
|
-
|
56
|
-
# Check if cache contains specified key
|
57
|
-
def contains(key)
|
58
|
-
# TODO: Implement
|
59
|
-
end
|
60
|
-
|
61
|
-
# Invalidate key/value pair
|
62
|
-
def invalidate(key)
|
63
|
-
# TODO: Implement
|
64
|
-
end
|
65
|
-
|
66
|
-
# Clear cache
|
67
|
-
def clear
|
68
|
-
# TODO: Implement
|
69
|
-
end
|
70
59
|
end # class MemcachedCache
|
71
60
|
end # module Cache
|
72
61
|
end # module Apollo
|
@@ -29,9 +29,13 @@ module Apollo
|
|
29
29
|
@cache = {}
|
30
30
|
end
|
31
31
|
|
32
|
+
def get(key)
|
33
|
+
@cache[key]
|
34
|
+
end
|
35
|
+
|
32
36
|
# Get value associated with key from cache
|
33
|
-
def
|
34
|
-
res =
|
37
|
+
def try_get(key, *args)
|
38
|
+
res = get(key)
|
35
39
|
|
36
40
|
# Not found, Create, cache and return
|
37
41
|
if res.nil? && block_given?
|
@@ -46,21 +50,6 @@ module Apollo
|
|
46
50
|
def set(key, value)
|
47
51
|
@cache[key] = value
|
48
52
|
end
|
49
|
-
|
50
|
-
# Check if cache contains specified key
|
51
|
-
def contains(key)
|
52
|
-
@cache.has_key?(key)
|
53
|
-
end
|
54
|
-
|
55
|
-
# Invalidate key/value pair
|
56
|
-
def invalidate(key)
|
57
|
-
@cache.delete(key)
|
58
|
-
end
|
59
|
-
|
60
|
-
# Clear cache
|
61
|
-
def clear
|
62
|
-
@cache.clear
|
63
|
-
end
|
64
53
|
end # class MemoryCache
|
65
54
|
end # module Cache
|
66
55
|
end # module Apollo
|
@@ -38,16 +38,19 @@ module Apollo
|
|
38
38
|
super(options)
|
39
39
|
|
40
40
|
opts = @@DEFAULT_OPTIONS.merge(options)
|
41
|
-
|
42
|
-
|
41
|
+
|
43
42
|
@mongo_client = Mongo::MongoClient.new(opts[:host], opts[:port], :pool_size => opts[:pool_size], :pool_timeout => opts[:pool_timeout])
|
44
43
|
@db = @mongo_client[opts[:db]]
|
45
44
|
@coll = @db[opts[:collection]]
|
46
45
|
end
|
47
46
|
|
47
|
+
def get(key)
|
48
|
+
@coll.find({:url => key})
|
49
|
+
end
|
50
|
+
|
48
51
|
# Get value associated with key from cache
|
49
|
-
def
|
50
|
-
res =
|
52
|
+
def try_get(key, *args)
|
53
|
+
res = get(key)
|
51
54
|
|
52
55
|
# Not found, Create, cache and return
|
53
56
|
if res.nil? || res.count < 1 && block_given?
|
@@ -27,10 +27,16 @@ module Apollo
|
|
27
27
|
super(options)
|
28
28
|
end
|
29
29
|
|
30
|
+
def get(key)
|
31
|
+
nil
|
32
|
+
end
|
33
|
+
|
30
34
|
# Get value associated with key from cache
|
31
|
-
def
|
35
|
+
def try_get(key, *args)
|
36
|
+
res = get(key)
|
37
|
+
|
32
38
|
# Not found, Create, cache and return
|
33
|
-
if block_given?
|
39
|
+
if res.nil? && block_given?
|
34
40
|
res = yield args
|
35
41
|
end
|
36
42
|
|
@@ -42,16 +48,6 @@ module Apollo
|
|
42
48
|
def set(key, value)
|
43
49
|
return value
|
44
50
|
end
|
45
|
-
|
46
|
-
# Check if cache contains specified key
|
47
|
-
def contains(key)
|
48
|
-
return false
|
49
|
-
end
|
50
|
-
|
51
|
-
# Invalidate key/value pair
|
52
|
-
def invalidate(key)
|
53
|
-
return true
|
54
|
-
end
|
55
51
|
end # NullCache
|
56
52
|
end # Cache
|
57
53
|
end # Apollo
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_cache')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Cache
|
25
|
+
class SqliteCache < BaseCache
|
26
|
+
def initilize(options = {})
|
27
|
+
super(options)
|
28
|
+
end
|
29
|
+
|
30
|
+
def get(key)
|
31
|
+
return nil
|
32
|
+
end
|
33
|
+
|
34
|
+
# Get value associated with key from cache
|
35
|
+
def try_get(key, *args)
|
36
|
+
res = get(key)
|
37
|
+
|
38
|
+
# Not found, Create, cache and return
|
39
|
+
if res.nil? && block_given?
|
40
|
+
res = yield args
|
41
|
+
end
|
42
|
+
|
43
|
+
return res
|
44
|
+
end
|
45
|
+
|
46
|
+
# Set value associated with key
|
47
|
+
# Return cached value
|
48
|
+
def set(key, value)
|
49
|
+
return value
|
50
|
+
end
|
51
|
+
end # SqliteCache
|
52
|
+
end # Cache
|
53
|
+
end # Apollo
|
@@ -112,7 +112,7 @@ module Apollo
|
|
112
112
|
# Add document to queue of results
|
113
113
|
res << doc
|
114
114
|
|
115
|
-
enqueue_url(doc[:links]) if doc[:links]
|
115
|
+
enqueue_url(doc[:links].map(){ |l| l[:link] }) if doc[:links]
|
116
116
|
end
|
117
117
|
|
118
118
|
# Break if the limit of documents to be processed was reached
|
@@ -130,10 +130,9 @@ module Apollo
|
|
130
130
|
def enqueue_url(url)
|
131
131
|
urls = []
|
132
132
|
return urls if url.nil?
|
133
|
-
|
134
133
|
# We support both - list of urls or single url
|
135
134
|
if(url.kind_of?(Array))
|
136
|
-
urls.concat(url)
|
135
|
+
urls = urls.concat(url)
|
137
136
|
else
|
138
137
|
urls << url
|
139
138
|
end
|
@@ -194,7 +193,7 @@ module Apollo
|
|
194
193
|
|
195
194
|
# TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
|
196
195
|
cache = Apollo::Cache::Factory.instance.construct
|
197
|
-
metadoc = cache.
|
196
|
+
metadoc = cache.try_get(url) do
|
198
197
|
max_attempts = 3
|
199
198
|
attempt_no = 0
|
200
199
|
success = false
|
@@ -0,0 +1,7 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
|
+
require File.join(File.dirname(__FILE__), 'google_crawler')
|
3
|
+
require File.join(File.dirname(__FILE__), 'hacker_news_crawler')
|
4
|
+
require File.join(File.dirname(__FILE__), 'slashdot_crawler')
|
5
|
+
require File.join(File.dirname(__FILE__), 'stackoverflow_crawler')
|
6
|
+
require File.join(File.dirname(__FILE__), 'xkcd_crawler')
|
7
|
+
require File.join(File.dirname(__FILE__), 'youjizz_crawler')
|
@@ -21,11 +21,12 @@
|
|
21
21
|
require "open-uri"
|
22
22
|
require "nokogiri"
|
23
23
|
|
24
|
+
require "em-http-request"
|
25
|
+
|
24
26
|
module Apollo
|
25
27
|
module Fetcher
|
26
28
|
class BaseFetcher
|
27
29
|
def self.fetch(url)
|
28
|
-
# TODO: Throw exception ???
|
29
30
|
return open(url).read
|
30
31
|
end
|
31
32
|
end # class BaseFetcher
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
module Apollo
|
22
|
+
module Helper
|
23
|
+
class CoreHelper
|
24
|
+
def self.name_re()
|
25
|
+
return /formatter$/
|
26
|
+
end
|
27
|
+
end # class CoreHelper
|
28
|
+
end # module Helper
|
29
|
+
end # module Apollo
|
@@ -0,0 +1 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'core_helper')
|
data/lib/apollo_crawler/lib.rb
CHANGED
@@ -18,40 +18,25 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
-
# Main
|
22
|
-
require File.join(File.dirname(__FILE__), 'program')
|
23
|
-
|
24
21
|
# Caches
|
25
|
-
require File.join(File.dirname(__FILE__), 'cache/
|
26
|
-
require File.join(File.dirname(__FILE__), 'cache/factory')
|
27
|
-
require File.join(File.dirname(__FILE__), 'cache/memcached_cache')
|
28
|
-
require File.join(File.dirname(__FILE__), 'cache/memory_cache')
|
29
|
-
require File.join(File.dirname(__FILE__), 'cache/mongo_cache')
|
30
|
-
require File.join(File.dirname(__FILE__), 'cache/null_cache')
|
22
|
+
require File.join(File.dirname(__FILE__), 'cache/caches')
|
31
23
|
|
32
24
|
# Crawlers
|
33
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
34
|
-
require File.join(File.dirname(__FILE__), 'crawler/google_crawler')
|
35
|
-
require File.join(File.dirname(__FILE__), 'crawler/hacker_news_crawler')
|
36
|
-
require File.join(File.dirname(__FILE__), 'crawler/slashdot_crawler')
|
37
|
-
require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_crawler')
|
38
|
-
require File.join(File.dirname(__FILE__), 'crawler/xkcd_crawler')
|
39
|
-
require File.join(File.dirname(__FILE__), 'crawler/youjizz_crawler')
|
25
|
+
require File.join(File.dirname(__FILE__), 'crawler/crawlers')
|
40
26
|
|
41
27
|
# Fetchers
|
42
|
-
require File.join(File.dirname(__FILE__), 'fetcher/
|
43
|
-
require File.join(File.dirname(__FILE__), 'fetcher/simple_fetcher')
|
44
|
-
require File.join(File.dirname(__FILE__), 'fetcher/smart_fetcher')
|
45
|
-
|
28
|
+
require File.join(File.dirname(__FILE__), 'fetcher/fetchers')
|
46
29
|
# Formatters
|
47
|
-
require File.join(File.dirname(__FILE__), 'formatter/
|
48
|
-
|
49
|
-
|
50
|
-
require File.join(File.dirname(__FILE__), '
|
30
|
+
require File.join(File.dirname(__FILE__), 'formatter/formatters')
|
31
|
+
|
32
|
+
# Helpers
|
33
|
+
require File.join(File.dirname(__FILE__), 'helper/helpers')
|
51
34
|
|
52
35
|
# Loggers
|
53
|
-
require File.join(File.dirname(__FILE__), 'logger/
|
54
|
-
|
36
|
+
require File.join(File.dirname(__FILE__), 'logger/loggers')
|
37
|
+
|
38
|
+
# Programs
|
39
|
+
require File.join(File.dirname(__FILE__), 'program/programs')
|
55
40
|
|
56
41
|
# Stores
|
57
|
-
require File.join(File.dirname(__FILE__), 'store/
|
42
|
+
require File.join(File.dirname(__FILE__), 'store/stores')
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
module Apollo
|
22
|
+
class BaseProgram
|
23
|
+
def self.require_files(files = [])
|
24
|
+
Dir.glob(files).each do |file|
|
25
|
+
require file
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end # class BaseProgram
|
29
|
+
end # module Apollo
|
@@ -18,9 +18,6 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
-
require "rubygems"
|
22
|
-
require "bundler/setup"
|
23
|
-
|
24
21
|
require 'json'
|
25
22
|
|
26
23
|
require "thor"
|
@@ -39,15 +36,17 @@ require 'terminal-table'
|
|
39
36
|
require 'eventmachine'
|
40
37
|
require 'em-http'
|
41
38
|
|
42
|
-
require File.join(File.dirname(__FILE__), 'version')
|
39
|
+
require File.join(File.dirname(__FILE__), '..', 'version')
|
43
40
|
|
44
41
|
# require File.join(File.dirname(__FILE__), 'config/crawler')
|
45
42
|
# puts Apollo::CrawlerProgramConfig
|
46
43
|
|
44
|
+
require File.join(File.dirname(__FILE__),'base_program')
|
45
|
+
|
47
46
|
module Apollo
|
48
|
-
class CrawlerProgram
|
47
|
+
class CrawlerProgram < BaseProgram
|
49
48
|
# Load default config
|
50
|
-
require File.join(File.dirname(__FILE__), "config")
|
49
|
+
require File.join(File.dirname(__FILE__), "..", "config")
|
51
50
|
|
52
51
|
# This hash will hold all of the options
|
53
52
|
# parsed from the command-line by OptionParser.
|
@@ -235,10 +234,7 @@ module Apollo
|
|
235
234
|
puts "Registering caches - '#{dir}'"
|
236
235
|
end
|
237
236
|
|
238
|
-
|
239
|
-
Dir.glob(files).each do |file|
|
240
|
-
require file
|
241
|
-
end
|
237
|
+
BaseProgram.require_files(File.join(dir, "**", "*.rb"))
|
242
238
|
|
243
239
|
tmp = Apollo::Cache.constants.select { |c|
|
244
240
|
Class === Apollo::Cache.const_get(c)
|
@@ -269,11 +265,8 @@ module Apollo
|
|
269
265
|
puts "Registering crawlers - '#{dir}'"
|
270
266
|
end
|
271
267
|
|
272
|
-
|
273
|
-
|
274
|
-
require file
|
275
|
-
end
|
276
|
-
|
268
|
+
BaseProgram.require_files(File.join(dir, "**", "*.rb"))
|
269
|
+
|
277
270
|
tmp = Apollo::Crawler.constants.select { |c|
|
278
271
|
Class === Apollo::Crawler.const_get(c)
|
279
272
|
}
|
@@ -303,10 +296,7 @@ module Apollo
|
|
303
296
|
puts "Registering formatters - '#{dir}'"
|
304
297
|
end
|
305
298
|
|
306
|
-
|
307
|
-
Dir.glob(files).each do |file|
|
308
|
-
require file
|
309
|
-
end
|
299
|
+
BaseProgram.require_files(File.join(dir, "**", "*.rb"))
|
310
300
|
|
311
301
|
tmp = Apollo::Formatter.constants.select { |c|
|
312
302
|
Class === Apollo::Formatter.const_get(c)
|
@@ -348,7 +338,7 @@ module Apollo
|
|
348
338
|
end
|
349
339
|
end
|
350
340
|
|
351
|
-
def generate_crawler(name, url = nil, matcher = nil)
|
341
|
+
def generate_crawler(name, url = nil, matcher = nil, options = @options)
|
352
342
|
name = name.titleize.gsub(" ", "")
|
353
343
|
|
354
344
|
if(@options[:verbose])
|
@@ -362,11 +352,13 @@ module Apollo
|
|
362
352
|
return -1
|
363
353
|
end
|
364
354
|
|
365
|
-
if(
|
355
|
+
if(options[:verbose])
|
366
356
|
puts "Using template '#{template_path}'"
|
367
357
|
end
|
368
358
|
|
369
|
-
|
359
|
+
unless(options[:silent])
|
360
|
+
dest_path = File.join(Dir.pwd, "#{name.underscore}.rb")
|
361
|
+
end
|
370
362
|
|
371
363
|
url = url ? url : "http://some-url-here"
|
372
364
|
matcher = matcher ? matcher : "//a"
|
@@ -414,23 +406,31 @@ module Apollo
|
|
414
406
|
return
|
415
407
|
end
|
416
408
|
|
417
|
-
def
|
409
|
+
def get_crawlers_by_name(crawlers, crawler_classes = @crawlers)
|
410
|
+
res = []
|
411
|
+
|
418
412
|
crawlers.each do |name|
|
419
413
|
crawler_name = name.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
|
420
414
|
|
421
|
-
crawler =
|
415
|
+
crawler = crawler_classes[crawler_name]
|
422
416
|
if(crawler == nil)
|
423
|
-
|
424
|
-
puts "See program help"
|
425
|
-
return 0
|
417
|
+
next
|
426
418
|
end
|
427
419
|
|
428
|
-
|
420
|
+
res << crawler
|
421
|
+
end
|
422
|
+
|
423
|
+
return res
|
424
|
+
end
|
425
|
+
|
426
|
+
def run_crawlers(crawlers, args, options = @options)
|
427
|
+
crawlers.each do |crawler|
|
428
|
+
if(options[:verbose])
|
429
429
|
puts "Running '#{crawler}'"
|
430
430
|
end
|
431
431
|
|
432
432
|
opts = {
|
433
|
-
:doc_limit =>
|
433
|
+
:doc_limit => options[:doc_limit]
|
434
434
|
}
|
435
435
|
|
436
436
|
res = crawler.new.etl(args, opts) { | docs |
|
@@ -442,23 +442,23 @@ module Apollo
|
|
442
442
|
end
|
443
443
|
|
444
444
|
# Get crawlers passed on the command line
|
445
|
-
def get_crawlers(args,
|
445
|
+
def get_crawlers(args, options = @options)
|
446
446
|
crawlers = []
|
447
447
|
if(args.length > 0)
|
448
448
|
crawlers << args.shift
|
449
449
|
end
|
450
450
|
|
451
|
-
if(
|
451
|
+
if(options[:run_all])
|
452
452
|
crawlers = @crawlers.keys
|
453
453
|
end
|
454
454
|
|
455
455
|
return crawlers
|
456
456
|
end
|
457
457
|
|
458
|
-
def init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES,
|
458
|
+
def init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, options = @options)
|
459
459
|
dirs.each do |dir|
|
460
460
|
if(File.directory?(dir) == false)
|
461
|
-
if(
|
461
|
+
if(options[:verbose])
|
462
462
|
puts "Creating '#{dir}'"
|
463
463
|
end
|
464
464
|
|
@@ -469,10 +469,10 @@ module Apollo
|
|
469
469
|
init_user_config_file(File.join(File.dirname(__FILE__), 'config_user.trb'), File.join(base_dir, 'config.rb'))
|
470
470
|
end
|
471
471
|
|
472
|
-
def init_user_config_file(config_path, dest_path,
|
472
|
+
def init_user_config_file(config_path, dest_path, options = @options)
|
473
473
|
# Create user config file
|
474
474
|
if(File.exists?(config_path) && File.exists?(dest_path) == false)
|
475
|
-
if(
|
475
|
+
if(options[:verbose])
|
476
476
|
puts "Creating user config file '#{config_path}' => '#{dest_path}'"
|
477
477
|
end
|
478
478
|
|
@@ -511,13 +511,15 @@ module Apollo
|
|
511
511
|
return request_exit(res_code)
|
512
512
|
end
|
513
513
|
|
514
|
-
|
515
|
-
if(
|
514
|
+
crawler_names = get_crawlers(args)
|
515
|
+
if(crawler_names.empty?)
|
516
516
|
puts @optparser
|
517
517
|
return request_exit(0)
|
518
518
|
end
|
519
519
|
|
520
|
-
|
520
|
+
crawlers = get_crawlers_by_name(crawler_names, @crawlers)
|
521
|
+
|
522
|
+
res_code = run_crawlers(crawlers, args, @options)
|
521
523
|
return request_exit(res_code)
|
522
524
|
end
|
523
525
|
|
@@ -0,0 +1 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_store')
|
metadata
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.15
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- Tomas Korcak
|
@@ -10,51 +11,42 @@ bindir: bin
|
|
10
11
|
cert_chain: []
|
11
12
|
date: 2013-03-03 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: amqp
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ! '>='
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ! '>='
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
14
|
- !ruby/object:Gem::Dependency
|
28
15
|
name: awesome_print
|
29
16
|
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
30
18
|
requirements:
|
31
|
-
- -
|
19
|
+
- - ~>
|
32
20
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
21
|
+
version: 1.1.0
|
34
22
|
type: :runtime
|
35
23
|
prerelease: false
|
36
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
37
26
|
requirements:
|
38
|
-
- -
|
27
|
+
- - ~>
|
39
28
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
29
|
+
version: 1.1.0
|
41
30
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
31
|
+
name: activesupport
|
43
32
|
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
44
34
|
requirements:
|
45
35
|
- - ! '>='
|
46
36
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
37
|
+
version: 3.2.12
|
48
38
|
type: :runtime
|
49
39
|
prerelease: false
|
50
40
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
51
42
|
requirements:
|
52
43
|
- - ! '>='
|
53
44
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
45
|
+
version: 3.2.12
|
55
46
|
- !ruby/object:Gem::Dependency
|
56
47
|
name: dalli
|
57
48
|
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
58
50
|
requirements:
|
59
51
|
- - ! '>='
|
60
52
|
- !ruby/object:Gem::Version
|
@@ -62,6 +54,7 @@ dependencies:
|
|
62
54
|
type: :runtime
|
63
55
|
prerelease: false
|
64
56
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
65
58
|
requirements:
|
66
59
|
- - ! '>='
|
67
60
|
- !ruby/object:Gem::Version
|
@@ -69,6 +62,7 @@ dependencies:
|
|
69
62
|
- !ruby/object:Gem::Dependency
|
70
63
|
name: fastercsv
|
71
64
|
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
72
66
|
requirements:
|
73
67
|
- - ~>
|
74
68
|
- !ruby/object:Gem::Version
|
@@ -76,13 +70,15 @@ dependencies:
|
|
76
70
|
type: :runtime
|
77
71
|
prerelease: false
|
78
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
79
74
|
requirements:
|
80
75
|
- - ~>
|
81
76
|
- !ruby/object:Gem::Version
|
82
77
|
version: 1.5.5
|
83
78
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
79
|
+
name: eventmachine
|
85
80
|
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
86
82
|
requirements:
|
87
83
|
- - ! '>='
|
88
84
|
- !ruby/object:Gem::Version
|
@@ -90,6 +86,7 @@ dependencies:
|
|
90
86
|
type: :runtime
|
91
87
|
prerelease: false
|
92
88
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
93
90
|
requirements:
|
94
91
|
- - ! '>='
|
95
92
|
- !ruby/object:Gem::Version
|
@@ -97,6 +94,7 @@ dependencies:
|
|
97
94
|
- !ruby/object:Gem::Dependency
|
98
95
|
name: em-http-request
|
99
96
|
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
100
98
|
requirements:
|
101
99
|
- - ! '>='
|
102
100
|
- !ruby/object:Gem::Version
|
@@ -104,6 +102,7 @@ dependencies:
|
|
104
102
|
type: :runtime
|
105
103
|
prerelease: false
|
106
104
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
107
106
|
requirements:
|
108
107
|
- - ! '>='
|
109
108
|
- !ruby/object:Gem::Version
|
@@ -111,6 +110,7 @@ dependencies:
|
|
111
110
|
- !ruby/object:Gem::Dependency
|
112
111
|
name: em-synchrony
|
113
112
|
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
114
|
requirements:
|
115
115
|
- - ! '>='
|
116
116
|
- !ruby/object:Gem::Version
|
@@ -118,41 +118,47 @@ dependencies:
|
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
121
122
|
requirements:
|
122
123
|
- - ! '>='
|
123
124
|
- !ruby/object:Gem::Version
|
124
125
|
version: '0'
|
125
126
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
127
|
+
name: amqp
|
127
128
|
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
128
130
|
requirements:
|
129
|
-
- -
|
131
|
+
- - ~>
|
130
132
|
- !ruby/object:Gem::Version
|
131
|
-
version:
|
133
|
+
version: 0.9.9
|
132
134
|
type: :runtime
|
133
135
|
prerelease: false
|
134
136
|
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
135
138
|
requirements:
|
136
|
-
- -
|
139
|
+
- - ~>
|
137
140
|
- !ruby/object:Gem::Version
|
138
|
-
version:
|
141
|
+
version: 0.9.9
|
139
142
|
- !ruby/object:Gem::Dependency
|
140
|
-
name:
|
143
|
+
name: json
|
141
144
|
requirement: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
142
146
|
requirements:
|
143
|
-
- -
|
147
|
+
- - ~>
|
144
148
|
- !ruby/object:Gem::Version
|
145
|
-
version:
|
149
|
+
version: 1.7.1
|
146
150
|
type: :runtime
|
147
151
|
prerelease: false
|
148
152
|
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
149
154
|
requirements:
|
150
|
-
- -
|
155
|
+
- - ~>
|
151
156
|
- !ruby/object:Gem::Version
|
152
|
-
version:
|
157
|
+
version: 1.7.1
|
153
158
|
- !ruby/object:Gem::Dependency
|
154
|
-
name:
|
159
|
+
name: memcache-client
|
155
160
|
requirement: !ruby/object:Gem::Requirement
|
161
|
+
none: false
|
156
162
|
requirements:
|
157
163
|
- - ! '>='
|
158
164
|
- !ruby/object:Gem::Version
|
@@ -160,13 +166,15 @@ dependencies:
|
|
160
166
|
type: :runtime
|
161
167
|
prerelease: false
|
162
168
|
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
163
170
|
requirements:
|
164
171
|
- - ! '>='
|
165
172
|
- !ruby/object:Gem::Version
|
166
173
|
version: '0'
|
167
174
|
- !ruby/object:Gem::Dependency
|
168
|
-
name:
|
175
|
+
name: mongo
|
169
176
|
requirement: !ruby/object:Gem::Requirement
|
177
|
+
none: false
|
170
178
|
requirements:
|
171
179
|
- - ! '>='
|
172
180
|
- !ruby/object:Gem::Version
|
@@ -174,13 +182,15 @@ dependencies:
|
|
174
182
|
type: :runtime
|
175
183
|
prerelease: false
|
176
184
|
version_requirements: !ruby/object:Gem::Requirement
|
185
|
+
none: false
|
177
186
|
requirements:
|
178
187
|
- - ! '>='
|
179
188
|
- !ruby/object:Gem::Version
|
180
189
|
version: '0'
|
181
190
|
- !ruby/object:Gem::Dependency
|
182
|
-
name:
|
191
|
+
name: mongoid
|
183
192
|
requirement: !ruby/object:Gem::Requirement
|
193
|
+
none: false
|
184
194
|
requirements:
|
185
195
|
- - ! '>='
|
186
196
|
- !ruby/object:Gem::Version
|
@@ -188,13 +198,15 @@ dependencies:
|
|
188
198
|
type: :runtime
|
189
199
|
prerelease: false
|
190
200
|
version_requirements: !ruby/object:Gem::Requirement
|
201
|
+
none: false
|
191
202
|
requirements:
|
192
203
|
- - ! '>='
|
193
204
|
- !ruby/object:Gem::Version
|
194
205
|
version: '0'
|
195
206
|
- !ruby/object:Gem::Dependency
|
196
|
-
name:
|
207
|
+
name: mime-types
|
197
208
|
requirement: !ruby/object:Gem::Requirement
|
209
|
+
none: false
|
198
210
|
requirements:
|
199
211
|
- - ! '>='
|
200
212
|
- !ruby/object:Gem::Version
|
@@ -202,27 +214,31 @@ dependencies:
|
|
202
214
|
type: :runtime
|
203
215
|
prerelease: false
|
204
216
|
version_requirements: !ruby/object:Gem::Requirement
|
217
|
+
none: false
|
205
218
|
requirements:
|
206
219
|
- - ! '>='
|
207
220
|
- !ruby/object:Gem::Version
|
208
221
|
version: '0'
|
209
222
|
- !ruby/object:Gem::Dependency
|
210
|
-
name:
|
223
|
+
name: nokogiri
|
211
224
|
requirement: !ruby/object:Gem::Requirement
|
225
|
+
none: false
|
212
226
|
requirements:
|
213
|
-
- -
|
227
|
+
- - ~>
|
214
228
|
- !ruby/object:Gem::Version
|
215
|
-
version:
|
229
|
+
version: 1.5.6
|
216
230
|
type: :runtime
|
217
231
|
prerelease: false
|
218
232
|
version_requirements: !ruby/object:Gem::Requirement
|
233
|
+
none: false
|
219
234
|
requirements:
|
220
|
-
- -
|
235
|
+
- - ~>
|
221
236
|
- !ruby/object:Gem::Version
|
222
|
-
version:
|
237
|
+
version: 1.5.6
|
223
238
|
- !ruby/object:Gem::Dependency
|
224
|
-
name:
|
239
|
+
name: openurl
|
225
240
|
requirement: !ruby/object:Gem::Requirement
|
241
|
+
none: false
|
226
242
|
requirements:
|
227
243
|
- - ! '>='
|
228
244
|
- !ruby/object:Gem::Version
|
@@ -230,13 +246,15 @@ dependencies:
|
|
230
246
|
type: :runtime
|
231
247
|
prerelease: false
|
232
248
|
version_requirements: !ruby/object:Gem::Requirement
|
249
|
+
none: false
|
233
250
|
requirements:
|
234
251
|
- - ! '>='
|
235
252
|
- !ruby/object:Gem::Version
|
236
253
|
version: '0'
|
237
254
|
- !ruby/object:Gem::Dependency
|
238
|
-
name:
|
255
|
+
name: parallel
|
239
256
|
requirement: !ruby/object:Gem::Requirement
|
257
|
+
none: false
|
240
258
|
requirements:
|
241
259
|
- - ! '>='
|
242
260
|
- !ruby/object:Gem::Version
|
@@ -244,69 +262,79 @@ dependencies:
|
|
244
262
|
type: :runtime
|
245
263
|
prerelease: false
|
246
264
|
version_requirements: !ruby/object:Gem::Requirement
|
265
|
+
none: false
|
247
266
|
requirements:
|
248
267
|
- - ! '>='
|
249
268
|
- !ruby/object:Gem::Version
|
250
269
|
version: '0'
|
251
270
|
- !ruby/object:Gem::Dependency
|
252
|
-
name:
|
271
|
+
name: rack
|
253
272
|
requirement: !ruby/object:Gem::Requirement
|
273
|
+
none: false
|
254
274
|
requirements:
|
255
275
|
- - ! '>='
|
256
276
|
- !ruby/object:Gem::Version
|
257
|
-
version:
|
277
|
+
version: 1.5.2
|
258
278
|
type: :runtime
|
259
279
|
prerelease: false
|
260
280
|
version_requirements: !ruby/object:Gem::Requirement
|
281
|
+
none: false
|
261
282
|
requirements:
|
262
283
|
- - ! '>='
|
263
284
|
- !ruby/object:Gem::Version
|
264
|
-
version:
|
285
|
+
version: 1.5.2
|
265
286
|
- !ruby/object:Gem::Dependency
|
266
287
|
name: terminal-table
|
267
288
|
requirement: !ruby/object:Gem::Requirement
|
289
|
+
none: false
|
268
290
|
requirements:
|
269
|
-
- -
|
291
|
+
- - ~>
|
270
292
|
- !ruby/object:Gem::Version
|
271
|
-
version:
|
293
|
+
version: 1.4.5
|
272
294
|
type: :runtime
|
273
295
|
prerelease: false
|
274
296
|
version_requirements: !ruby/object:Gem::Requirement
|
297
|
+
none: false
|
275
298
|
requirements:
|
276
|
-
- -
|
299
|
+
- - ~>
|
277
300
|
- !ruby/object:Gem::Version
|
278
|
-
version:
|
301
|
+
version: 1.4.5
|
279
302
|
- !ruby/object:Gem::Dependency
|
280
303
|
name: thor
|
281
304
|
requirement: !ruby/object:Gem::Requirement
|
305
|
+
none: false
|
282
306
|
requirements:
|
283
|
-
- -
|
307
|
+
- - ~>
|
284
308
|
- !ruby/object:Gem::Version
|
285
|
-
version:
|
309
|
+
version: 0.17.0
|
286
310
|
type: :runtime
|
287
311
|
prerelease: false
|
288
312
|
version_requirements: !ruby/object:Gem::Requirement
|
313
|
+
none: false
|
289
314
|
requirements:
|
290
|
-
- -
|
315
|
+
- - ~>
|
291
316
|
- !ruby/object:Gem::Version
|
292
|
-
version:
|
317
|
+
version: 0.17.0
|
293
318
|
- !ruby/object:Gem::Dependency
|
294
|
-
name:
|
319
|
+
name: writeexcel
|
295
320
|
requirement: !ruby/object:Gem::Requirement
|
321
|
+
none: false
|
296
322
|
requirements:
|
297
|
-
- -
|
323
|
+
- - ~>
|
298
324
|
- !ruby/object:Gem::Version
|
299
|
-
version:
|
325
|
+
version: 0.6.18
|
300
326
|
type: :runtime
|
301
327
|
prerelease: false
|
302
328
|
version_requirements: !ruby/object:Gem::Requirement
|
329
|
+
none: false
|
303
330
|
requirements:
|
304
|
-
- -
|
331
|
+
- - ~>
|
305
332
|
- !ruby/object:Gem::Version
|
306
|
-
version:
|
333
|
+
version: 0.6.18
|
307
334
|
- !ruby/object:Gem::Dependency
|
308
335
|
name: guard
|
309
336
|
requirement: !ruby/object:Gem::Requirement
|
337
|
+
none: false
|
310
338
|
requirements:
|
311
339
|
- - ! '>='
|
312
340
|
- !ruby/object:Gem::Version
|
@@ -314,6 +342,7 @@ dependencies:
|
|
314
342
|
type: :development
|
315
343
|
prerelease: false
|
316
344
|
version_requirements: !ruby/object:Gem::Requirement
|
345
|
+
none: false
|
317
346
|
requirements:
|
318
347
|
- - ! '>='
|
319
348
|
- !ruby/object:Gem::Version
|
@@ -321,6 +350,7 @@ dependencies:
|
|
321
350
|
- !ruby/object:Gem::Dependency
|
322
351
|
name: guard-rake
|
323
352
|
requirement: !ruby/object:Gem::Requirement
|
353
|
+
none: false
|
324
354
|
requirements:
|
325
355
|
- - ! '>='
|
326
356
|
- !ruby/object:Gem::Version
|
@@ -328,6 +358,7 @@ dependencies:
|
|
328
358
|
type: :development
|
329
359
|
prerelease: false
|
330
360
|
version_requirements: !ruby/object:Gem::Requirement
|
361
|
+
none: false
|
331
362
|
requirements:
|
332
363
|
- - ! '>='
|
333
364
|
- !ruby/object:Gem::Version
|
@@ -335,6 +366,7 @@ dependencies:
|
|
335
366
|
- !ruby/object:Gem::Dependency
|
336
367
|
name: guard-rspec
|
337
368
|
requirement: !ruby/object:Gem::Requirement
|
369
|
+
none: false
|
338
370
|
requirements:
|
339
371
|
- - ! '>='
|
340
372
|
- !ruby/object:Gem::Version
|
@@ -342,6 +374,7 @@ dependencies:
|
|
342
374
|
type: :development
|
343
375
|
prerelease: false
|
344
376
|
version_requirements: !ruby/object:Gem::Requirement
|
377
|
+
none: false
|
345
378
|
requirements:
|
346
379
|
- - ! '>='
|
347
380
|
- !ruby/object:Gem::Version
|
@@ -353,57 +386,69 @@ executables:
|
|
353
386
|
extensions: []
|
354
387
|
extra_rdoc_files: []
|
355
388
|
files:
|
389
|
+
- ./lib/apollo_crawler/lib.rb
|
390
|
+
- ./lib/apollo_crawler/program/base_program.rb
|
391
|
+
- ./lib/apollo_crawler/program/programs.rb
|
392
|
+
- ./lib/apollo_crawler/program/crawler_program.rb
|
356
393
|
- ./lib/apollo_crawler/fetcher/smart_fetcher.rb
|
357
394
|
- ./lib/apollo_crawler/fetcher/simple_fetcher.rb
|
358
395
|
- ./lib/apollo_crawler/fetcher/base_fetcher.rb
|
359
|
-
- ./lib/apollo_crawler/
|
396
|
+
- ./lib/apollo_crawler/fetcher/fetchers.rb
|
360
397
|
- ./lib/apollo_crawler/version.rb
|
361
|
-
- ./lib/apollo_crawler/logger/console_logger.rb
|
362
|
-
- ./lib/apollo_crawler/logger/base_logger.rb
|
363
|
-
- ./lib/apollo_crawler/program.rb
|
364
|
-
- ./lib/apollo_crawler/config.rb
|
365
|
-
- ./lib/apollo_crawler/cache/factory.rb
|
366
|
-
- ./lib/apollo_crawler/cache/null_cache.rb
|
367
|
-
- ./lib/apollo_crawler/cache/memory_cache.rb
|
368
|
-
- ./lib/apollo_crawler/cache/base_cache.rb
|
369
|
-
- ./lib/apollo_crawler/cache/mongo_cache.rb
|
370
|
-
- ./lib/apollo_crawler/cache/memcached_cache.rb
|
371
|
-
- ./lib/apollo_crawler/crawler/xkcd_crawler.rb
|
372
|
-
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
373
|
-
- ./lib/apollo_crawler/crawler/youjizz_crawler.rb
|
374
398
|
- ./lib/apollo_crawler/crawler/slashdot_crawler.rb
|
399
|
+
- ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
|
375
400
|
- ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
|
401
|
+
- ./lib/apollo_crawler/crawler/youjizz_crawler.rb
|
402
|
+
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
376
403
|
- ./lib/apollo_crawler/crawler/base_crawler.rb
|
377
|
-
- ./lib/apollo_crawler/crawler/
|
404
|
+
- ./lib/apollo_crawler/crawler/xkcd_crawler.rb
|
405
|
+
- ./lib/apollo_crawler/crawler/crawlers.rb
|
406
|
+
- ./lib/apollo_crawler/logger/loggers.rb
|
407
|
+
- ./lib/apollo_crawler/logger/console_logger.rb
|
408
|
+
- ./lib/apollo_crawler/logger/base_logger.rb
|
409
|
+
- ./lib/apollo_crawler/config.rb
|
378
410
|
- ./lib/apollo_crawler/formatter/table_formatter.rb
|
379
411
|
- ./lib/apollo_crawler/formatter/base_formatter.rb
|
380
412
|
- ./lib/apollo_crawler/formatter/json_formatter.rb
|
381
413
|
- ./lib/apollo_crawler/formatter/plain_formatter.rb
|
414
|
+
- ./lib/apollo_crawler/formatter/formatters.rb
|
415
|
+
- ./lib/apollo_crawler/cache/memcached_cache.rb
|
416
|
+
- ./lib/apollo_crawler/cache/memory_cache.rb
|
417
|
+
- ./lib/apollo_crawler/cache/mongo_cache.rb
|
418
|
+
- ./lib/apollo_crawler/cache/null_cache.rb
|
419
|
+
- ./lib/apollo_crawler/cache/factory.rb
|
420
|
+
- ./lib/apollo_crawler/cache/caches.rb
|
421
|
+
- ./lib/apollo_crawler/cache/sqlite_cache.rb
|
422
|
+
- ./lib/apollo_crawler/cache/base_cache.rb
|
382
423
|
- ./lib/apollo_crawler/store/base_store.rb
|
424
|
+
- ./lib/apollo_crawler/store/stores.rb
|
425
|
+
- ./lib/apollo_crawler/helper/core_helper.rb
|
426
|
+
- ./lib/apollo_crawler/helper/helpers.rb
|
383
427
|
- ./lib/apollo_crawler.rb
|
384
428
|
- bin/apollo-crawler
|
385
429
|
homepage: https://github.com/korczis/apollo-crawler
|
386
430
|
licenses:
|
387
431
|
- MIT
|
388
|
-
metadata: {}
|
389
432
|
post_install_message: Thanks for installing Apollo Crawler!
|
390
433
|
rdoc_options: []
|
391
434
|
require_paths:
|
392
435
|
- lib
|
393
436
|
required_ruby_version: !ruby/object:Gem::Requirement
|
437
|
+
none: false
|
394
438
|
requirements:
|
395
439
|
- - ! '>='
|
396
440
|
- !ruby/object:Gem::Version
|
397
441
|
version: '0'
|
398
442
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
443
|
+
none: false
|
399
444
|
requirements:
|
400
445
|
- - ! '>='
|
401
446
|
- !ruby/object:Gem::Version
|
402
447
|
version: '0'
|
403
448
|
requirements: []
|
404
449
|
rubyforge_project:
|
405
|
-
rubygems_version:
|
450
|
+
rubygems_version: 1.8.23
|
406
451
|
signing_key:
|
407
|
-
specification_version:
|
452
|
+
specification_version: 3
|
408
453
|
summary: Apollo Platform Crawler
|
409
454
|
test_files: []
|
checksums.yaml
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
---
|
2
|
-
!binary "U0hBMQ==":
|
3
|
-
metadata.gz: !binary |-
|
4
|
-
NDZmNjQ3N2FkZmVkYjc5NjQ3NjJiZTMyZGFlYjY4ODA0ZDgwYWE2OA==
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
OTA2NmJhOTEyYmJiMTFiYTFmZWFjNzY3YmEzYjYyZTc0MzZlMDk1Mw==
|
7
|
-
!binary "U0hBNTEy":
|
8
|
-
metadata.gz: !binary |-
|
9
|
-
MDk1NDcwN2FhYjA0YTVjNTA4N2FhOWRjNzcyMTJlMDg3ZjlmNmM2NTQyMjY2
|
10
|
-
ZTZjNjk1YjQ2MmYxMGU2MzViNmJkYjU3OTFlMDk2MTEzMjE2MTdkMGU3NWQ4
|
11
|
-
ZDdlN2Y0YTc1MjQ4NmRiZDc2ZGExMTkwMmViODVkYjY2MmI1YjI=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
MmZlM2IxNjUyMDg3Yjk3YjE5ODQ3OWM4NWY2NzIwNzEwODQ0OWJlMGI3MmQ5
|
14
|
-
ZmJkY2E3NDljYTJiMjhmYzMxYzY2ZTZlMTJmYTAyYjA4NWMxYTdkYmU1ZGUz
|
15
|
-
YzAwMzlhODY0NGQyMzNlMTc5MjQ1OTI0NzEzZmY2NmExMjA0Zjg=
|