apollo-crawler 0.1.8 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/apollo_crawler.rb +20 -13
- data/lib/apollo_crawler/cache/{cache_base.rb → base_cache.rb} +4 -4
- data/lib/apollo_crawler/cache/factory.rb +1 -1
- data/lib/apollo_crawler/cache/memcached_cache.rb +5 -5
- data/lib/apollo_crawler/cache/memory_cache.rb +5 -5
- data/lib/apollo_crawler/cache/null_cache.rb +3 -3
- data/lib/apollo_crawler/config.rb +25 -10
- data/lib/apollo_crawler/crawler/{crawler_base.rb → base_crawler.rb} +41 -22
- data/lib/apollo_crawler/crawler/{google_com/google.rb → google_crawler.rb} +7 -7
- data/lib/apollo_crawler/crawler/{ycombinator_com/hacker_news.rb → hacker_news_crawler.rb} +7 -7
- data/lib/apollo_crawler/crawler/{slashdot_org/slashdot.rb → slashdot_crawler.rb} +7 -7
- data/lib/apollo_crawler/crawler/{stackoverflow_com/stackoverflow.rb → stackoverflow_crawler.rb} +7 -7
- data/lib/apollo_crawler/crawler/{xkcd_com/xkcd.rb → xkcd_crawler.rb} +5 -5
- data/lib/apollo_crawler/fetcher/base_fetcher.rb +11 -0
- data/lib/apollo_crawler/fetcher/simple_fetcher.rb +12 -5
- data/lib/apollo_crawler/fetcher/smart_fetcher.rb +15 -0
- data/lib/apollo_crawler/formatter/base_formatter.rb +9 -0
- data/lib/apollo_crawler/formatter/{formatter_json.rb → json_formatter.rb} +5 -5
- data/lib/apollo_crawler/formatter/plain_formatter.rb +17 -0
- data/lib/apollo_crawler/formatter/{formatter_table.rb → table_formatter.rb} +5 -5
- data/lib/apollo_crawler/lib.rb +20 -13
- data/lib/apollo_crawler/logger/base_logger.rb +6 -0
- data/lib/apollo_crawler/logger/console_logger.rb +15 -0
- data/lib/apollo_crawler/program.rb +15 -8
- data/lib/apollo_crawler/store/base_store.rb +6 -0
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +17 -16
- data/lib/apollo_crawler/cache/filesystem_cache.rb +0 -37
- data/lib/apollo_crawler/crawler/crawler_template.rb +0 -24
- data/lib/apollo_crawler/fetcher/fetcher_base.rb +0 -6
- data/lib/apollo_crawler/formatter/formatter_base.rb +0 -6
- data/lib/apollo_crawler/formatter/formatter_plain.rb +0 -17
- data/lib/apollo_crawler/store/store_base.rb +0 -6
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
Y2I3MDM1OWQ1NmU2ZDMzMTg0OGRhNTYzODc5Mzg4MDhhZTkxOWJlMQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ZWY0N2M3ZjU5ZGNmZjgwMTdkNWI0Y2JhYmZjZmUwNDFjYTA5ZjAwOA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MjM1ZWUzMGUyYzBiYjdmNmFjMmE4OTU5MjA0NDE3YWJlZTE4OWQ0YTE5MWQx
|
10
|
+
ZDBmN2UzZjk3N2Y3NDYwOGNiMWFiY2JkY2I2ODJmYzFkYWU3YTYzYjI5YTI0
|
11
|
+
NDQxZDk4YzBlODBmNDg3MzRkMDU2OWY1ZmViNmUzYWFhZjZlNGU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MDIzY2RhMmFkNmI4MjBlYzkzNmZhOTA5NjM2NWUyM2YzODQwM2M0NGIzN2U1
|
14
|
+
OTgxNGYwYTdhMTIxZGVkYzNlYjE1M2U1N2NjMDdiN2I4MWNkMTMwMzRmMDMy
|
15
|
+
MWFiOWRhNGIyOTZhY2NmZDE2YTRjMzUxYjQyODU1NTAzNjQwM2M=
|
data/lib/apollo_crawler.rb
CHANGED
@@ -5,27 +5,34 @@
|
|
5
5
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/program')
|
6
6
|
|
7
7
|
# Caches
|
8
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/
|
8
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/base_cache')
|
9
9
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/factory')
|
10
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/filesystem_cache')
|
11
10
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memcached_cache')
|
12
11
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memory_cache')
|
13
12
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/null_cache')
|
14
13
|
|
15
14
|
# Crawlers
|
16
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
17
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
18
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
19
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
20
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
21
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
15
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/base_crawler')
|
16
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/google_crawler')
|
17
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/hacker_news_crawler')
|
18
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_crawler')
|
19
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_crawler')
|
20
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_crawler')
|
22
21
|
|
23
22
|
# Fetchers
|
24
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/
|
23
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/base_fetcher')
|
25
24
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/simple_fetcher')
|
25
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/smart_fetcher')
|
26
26
|
|
27
27
|
# Formatters
|
28
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/
|
29
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/
|
30
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/
|
31
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/
|
28
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/base_formatter')
|
29
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/json_formatter')
|
30
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/plain_formatter')
|
31
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/table_formatter')
|
32
|
+
|
33
|
+
# Loggers
|
34
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/base_logger')
|
35
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/console_logger')
|
36
|
+
|
37
|
+
# Stores
|
38
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/store/base_store')
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Apollo
|
2
2
|
module Cache
|
3
|
-
class
|
3
|
+
class BaseCache
|
4
4
|
# Get value associated with key from cache
|
5
5
|
def get(key, *args)
|
6
6
|
|
@@ -32,6 +32,6 @@ module Apollo
|
|
32
32
|
def clear
|
33
33
|
return
|
34
34
|
end
|
35
|
-
end #
|
36
|
-
end # Cache
|
37
|
-
end # Apollo
|
35
|
+
end # class BaseCache
|
36
|
+
end # module Cache
|
37
|
+
end # module Apollo
|
@@ -1,10 +1,10 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_cache')
|
2
2
|
|
3
3
|
require 'dalli'
|
4
4
|
|
5
5
|
module Apollo
|
6
6
|
module Cache
|
7
|
-
class
|
7
|
+
class MemcachedCache < BaseCache
|
8
8
|
@cache = nil
|
9
9
|
|
10
10
|
def initialize
|
@@ -46,6 +46,6 @@ module Apollo
|
|
46
46
|
def clear
|
47
47
|
# TODO: Implement
|
48
48
|
end
|
49
|
-
end #
|
50
|
-
end # Cache
|
51
|
-
end # Apollo
|
49
|
+
end # class MemcachedCache
|
50
|
+
end # module Cache
|
51
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_cache')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Cache
|
5
|
-
class
|
5
|
+
class MemoryCache < BaseCache
|
6
6
|
@cache = nil
|
7
7
|
|
8
8
|
def initialize
|
@@ -41,6 +41,6 @@ module Apollo
|
|
41
41
|
def clear
|
42
42
|
@cache.clear
|
43
43
|
end
|
44
|
-
end #
|
45
|
-
end # Cache
|
46
|
-
end # Apollo
|
44
|
+
end # class MemoryCache
|
45
|
+
end # module Cache
|
46
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_cache')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Cache
|
5
|
-
class
|
5
|
+
class NullCache < BaseCache
|
6
6
|
# Get value associated with key from cache
|
7
7
|
def get(key, *args)
|
8
8
|
# Not found, Create, cache and return
|
@@ -28,6 +28,6 @@ module Apollo
|
|
28
28
|
def invalidate(key)
|
29
29
|
return true
|
30
30
|
end
|
31
|
-
end #
|
31
|
+
end # NullCache
|
32
32
|
end # Cache
|
33
33
|
end # Apollo
|
@@ -14,40 +14,55 @@ module RbConfig
|
|
14
14
|
############################################################
|
15
15
|
#
|
16
16
|
# Filesystem backend
|
17
|
-
# CACHE_CLASS = Apollo::Cache::
|
17
|
+
# CACHE_CLASS = Apollo::Cache::FilesystemCache
|
18
18
|
#
|
19
19
|
# Memcached - expects localhost:11211
|
20
|
-
# CACHE_CLASS = Apollo::Cache::
|
20
|
+
# CACHE_CLASS = Apollo::Cache::MemcachedCache
|
21
21
|
#
|
22
22
|
# Pure naive ruby in-memory implementation
|
23
|
-
# CACHE_CLASS = Apollo::Cache::
|
23
|
+
# CACHE_CLASS = Apollo::Cache::MemoryCache
|
24
24
|
#
|
25
25
|
# Null caching - no caching at all
|
26
|
-
# CACHE_CLASS = Apollo::Cache::
|
26
|
+
# CACHE_CLASS = Apollo::Cache::NullCache
|
27
27
|
|
28
28
|
# Used caching mechanism by default
|
29
|
-
CACHE_CLASS = Apollo::Cache::
|
29
|
+
CACHE_CLASS = Apollo::Cache::MemcachedCache
|
30
30
|
|
31
31
|
|
32
32
|
|
33
33
|
############################################################
|
34
34
|
# Crawlers - Built-in out-of box working crawlers
|
35
35
|
############################################################
|
36
|
-
CRAWLERS_DIR = File.join(File.dirname(__FILE__), "
|
36
|
+
CRAWLERS_DIR = File.join(File.dirname(__FILE__), "crawler")
|
37
37
|
|
38
38
|
# Template used for generated crawlers
|
39
|
-
CRAWLER_TEMPLATE_NAME = "crawler_template.
|
39
|
+
CRAWLER_TEMPLATE_NAME = "crawler_template.trb"
|
40
40
|
|
41
41
|
# Path of template
|
42
|
-
CRAWLER_TEMPLATE_PATH = File.join(
|
42
|
+
CRAWLER_TEMPLATE_PATH = File.join(CRAWLERS_DIR, CRAWLER_TEMPLATE_NAME)
|
43
43
|
|
44
44
|
|
45
45
|
|
46
|
+
############################################################
|
47
|
+
# Fetchers - used for fetching documents
|
48
|
+
############################################################
|
49
|
+
FETCHERS_DIR = File.join(File.dirname(__FILE__), "fetcher")
|
50
|
+
|
51
|
+
DEFAULT_FETCHER = Apollo::Fetcher::SmartFetcher
|
52
|
+
|
53
|
+
|
46
54
|
############################################################
|
47
55
|
# Formatters - used for formatting crawled documents results
|
48
56
|
############################################################
|
49
|
-
FORMATTERS_DIR = File.join(File.dirname(__FILE__), "
|
57
|
+
FORMATTERS_DIR = File.join(File.dirname(__FILE__), "formatter")
|
50
58
|
|
51
59
|
# Default formatter if no other specified
|
52
|
-
DEFAULT_FORMATTER = Apollo::Formatter::
|
60
|
+
DEFAULT_FORMATTER = Apollo::Formatter::JsonFormatter
|
61
|
+
|
62
|
+
|
63
|
+
|
64
|
+
############################################################
|
65
|
+
# Loggers - used for formatting output messages
|
66
|
+
############################################################
|
67
|
+
DEFAULT_LOGGER = Apollo::Logger::ConsoleLogger
|
53
68
|
end # Config
|
@@ -3,13 +3,19 @@ require "nokogiri"
|
|
3
3
|
|
4
4
|
module Apollo
|
5
5
|
module Crawler
|
6
|
-
class
|
6
|
+
class BaseCrawler
|
7
|
+
|
8
|
+
|
7
9
|
@backlog = nil
|
8
10
|
|
9
11
|
def initialize
|
10
12
|
@backlog = []
|
11
13
|
end
|
12
14
|
|
15
|
+
def self.name_re()
|
16
|
+
return /crawler$/
|
17
|
+
end
|
18
|
+
|
13
19
|
# Name of the crawler
|
14
20
|
def name
|
15
21
|
return "Crawler Base"
|
@@ -19,6 +25,10 @@ module Apollo
|
|
19
25
|
return nil
|
20
26
|
end
|
21
27
|
|
28
|
+
def self.fetch(url)
|
29
|
+
RbConfig::DEFAULT_FETCHER.fetch(url)
|
30
|
+
end
|
31
|
+
|
22
32
|
def self.try_get_url(root, url)
|
23
33
|
begin
|
24
34
|
return URI.join(root, url)
|
@@ -28,7 +38,7 @@ module Apollo
|
|
28
38
|
end
|
29
39
|
|
30
40
|
def self.try_get_doc(root, url)
|
31
|
-
doc =
|
41
|
+
doc = BaseCrawler.try_get_url(root, url)
|
32
42
|
|
33
43
|
# TODO: Set expiration header
|
34
44
|
return {
|
@@ -47,16 +57,19 @@ module Apollo
|
|
47
57
|
url = self.url
|
48
58
|
end
|
49
59
|
|
60
|
+
# TODO: Be more aggressive, use assert, it is the client's responsibility!
|
50
61
|
if(url.nil?)
|
51
62
|
return nil
|
52
63
|
end
|
53
64
|
|
65
|
+
# We support both - list of urls or single url
|
54
66
|
if(url.kind_of?(Array))
|
55
67
|
@backlog.concat(url)
|
56
68
|
else
|
57
69
|
@backlog << url
|
58
70
|
end
|
59
71
|
|
72
|
+
# Counter of processed documents (pages)
|
60
73
|
docs_processed = 0
|
61
74
|
|
62
75
|
res = []
|
@@ -66,34 +79,40 @@ module Apollo
|
|
66
79
|
|
67
80
|
# puts "Processing '#{url}'"
|
68
81
|
doc = self.process_url(url)
|
69
|
-
res << doc
|
70
82
|
|
71
|
-
#
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
83
|
+
# Increase counter of processed documents
|
84
|
+
docs_processed = docs_processed + 1
|
85
|
+
|
86
|
+
# Process document if it was successfully retrieved
|
87
|
+
if(!doc.nil?)
|
88
|
+
# TODO: Use log4r and log it only on info level
|
89
|
+
if block_given?
|
90
|
+
yield doc
|
91
|
+
end
|
76
92
|
|
77
|
-
|
78
|
-
|
79
|
-
url = link[:link].to_s
|
80
|
-
# TODO: Use log4r and log it only on info level
|
81
|
-
#puts url
|
93
|
+
# Add document to queue of results
|
94
|
+
res << doc
|
82
95
|
|
83
|
-
|
84
|
-
|
96
|
+
# If the document has links, add them to the backlog
|
97
|
+
if(doc[:links].nil? == false)
|
98
|
+
doc[:links].each do |link|
|
99
|
+
url = link[:link].to_s
|
100
|
+
# TODO: Use log4r and log it only on info level
|
101
|
+
#puts url
|
102
|
+
|
103
|
+
# TODO: Check if it is unique
|
104
|
+
@backlog << url
|
105
|
+
end
|
85
106
|
end
|
86
107
|
end
|
87
108
|
|
88
|
-
#
|
89
|
-
docs_processed = docs_processed + 1
|
109
|
+
# Break if the limit of documents to be processed was reached
|
90
110
|
break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
|
91
111
|
end
|
92
112
|
return res
|
93
113
|
end
|
94
114
|
|
95
115
|
def process_url(url)
|
96
|
-
# Try fetch document
|
97
116
|
doc = self.fetch_document(url)
|
98
117
|
if(doc.nil?)
|
99
118
|
return nil
|
@@ -140,7 +159,7 @@ module Apollo
|
|
140
159
|
res = nil
|
141
160
|
while(attempt_no < max_attempts && success == false) do
|
142
161
|
begin
|
143
|
-
res =
|
162
|
+
res = BaseCrawler.fetch(url)
|
144
163
|
success = true
|
145
164
|
rescue Exception => e
|
146
165
|
puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
|
@@ -170,6 +189,6 @@ module Apollo
|
|
170
189
|
res = []
|
171
190
|
return res
|
172
191
|
end
|
173
|
-
end #
|
174
|
-
end # Crawler
|
175
|
-
end # Apollo
|
192
|
+
end # class BaseCrawler
|
193
|
+
end # module Crawler
|
194
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Crawler
|
5
|
-
class
|
5
|
+
class GoogleCrawler < BaseCrawler
|
6
6
|
@@MATCHER_ITEM = "//h3/a"
|
7
7
|
|
8
8
|
def name()
|
@@ -15,7 +15,7 @@ module Apollo
|
|
15
15
|
|
16
16
|
def extract_data(doc)
|
17
17
|
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
18
|
-
url =
|
18
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
19
19
|
next if url.nil?
|
20
20
|
|
21
21
|
{
|
@@ -27,7 +27,7 @@ module Apollo
|
|
27
27
|
|
28
28
|
def extract_links(doc)
|
29
29
|
res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
|
30
|
-
res_doc =
|
30
|
+
res_doc = BaseCrawler.try_get_url(self.url, node['href'])
|
31
31
|
next if url.nil?
|
32
32
|
|
33
33
|
{
|
@@ -35,6 +35,6 @@ module Apollo
|
|
35
35
|
}
|
36
36
|
}
|
37
37
|
end
|
38
|
-
end
|
39
|
-
end # Crawler
|
40
|
-
end # Apollo
|
38
|
+
end # class GoogleCrawler
|
39
|
+
end # module Crawler
|
40
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Crawler
|
5
|
-
class
|
5
|
+
class HackerNewsCrawler < BaseCrawler
|
6
6
|
@@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
|
7
7
|
|
8
8
|
def name()
|
@@ -15,7 +15,7 @@ module Apollo
|
|
15
15
|
|
16
16
|
def extract_data(doc)
|
17
17
|
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
-
url =
|
18
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
19
19
|
next if url.nil?
|
20
20
|
|
21
21
|
{
|
@@ -29,7 +29,7 @@ module Apollo
|
|
29
29
|
|
30
30
|
def extract_links(doc)
|
31
31
|
res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
|
32
|
-
url =
|
32
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
33
33
|
next if url.nil?
|
34
34
|
|
35
35
|
{
|
@@ -39,6 +39,6 @@ module Apollo
|
|
39
39
|
|
40
40
|
return res.uniq
|
41
41
|
end
|
42
|
-
end
|
43
|
-
end # Crawler
|
44
|
-
end # Apollo
|
42
|
+
end # class HackerNewsCrawler
|
43
|
+
end # module Crawler
|
44
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Crawler
|
5
|
-
class
|
5
|
+
class SlashdotCrawler < BaseCrawler
|
6
6
|
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
7
7
|
|
8
8
|
def name()
|
@@ -15,7 +15,7 @@ module Apollo
|
|
15
15
|
|
16
16
|
def extract_data(doc)
|
17
17
|
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
18
|
-
url =
|
18
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
19
19
|
next if url.nil?
|
20
20
|
|
21
21
|
{
|
@@ -27,7 +27,7 @@ module Apollo
|
|
27
27
|
|
28
28
|
def extract_links(doc)
|
29
29
|
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
30
|
-
url =
|
30
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
31
31
|
next if url.nil?
|
32
32
|
|
33
33
|
{
|
@@ -35,6 +35,6 @@ module Apollo
|
|
35
35
|
}
|
36
36
|
}
|
37
37
|
end
|
38
|
-
end
|
39
|
-
end # Crawler
|
40
|
-
end # Apollo
|
38
|
+
end # class SlashdotCrawler
|
39
|
+
end # module Crawler
|
40
|
+
end # module Apollo
|
data/lib/apollo_crawler/crawler/{stackoverflow_com/stackoverflow.rb → stackoverflow_crawler.rb}
RENAMED
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Crawler
|
5
|
-
class
|
5
|
+
class StackoverflowCrawler < BaseCrawler
|
6
6
|
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
7
7
|
|
8
8
|
def name()
|
@@ -15,7 +15,7 @@ module Apollo
|
|
15
15
|
|
16
16
|
def extract_data(doc)
|
17
17
|
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
-
url =
|
18
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
19
19
|
next if url.nil?
|
20
20
|
|
21
21
|
{
|
@@ -29,7 +29,7 @@ module Apollo
|
|
29
29
|
|
30
30
|
def extract_links(doc)
|
31
31
|
res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
|
32
|
-
url =
|
32
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
33
33
|
next if url.nil?
|
34
34
|
|
35
35
|
{
|
@@ -39,6 +39,6 @@ module Apollo
|
|
39
39
|
|
40
40
|
return res.uniq
|
41
41
|
end
|
42
|
-
end
|
43
|
-
end # Crawler
|
44
|
-
end # Apollo
|
42
|
+
end # class StackoverflowCrawler
|
43
|
+
end # module Crawler
|
44
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Crawler
|
5
|
-
class
|
5
|
+
class XkcdCrawler < BaseCrawler
|
6
6
|
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
7
7
|
|
8
8
|
def name()
|
@@ -30,6 +30,6 @@ module Apollo
|
|
30
30
|
}
|
31
31
|
res.uniq
|
32
32
|
end
|
33
|
-
end
|
34
|
-
end # Crawler
|
35
|
-
end # Apollo
|
33
|
+
end # class XkcdCrawler
|
34
|
+
end # module Crawler
|
35
|
+
end # module Apollo
|
@@ -1,8 +1,15 @@
|
|
1
|
-
require
|
1
|
+
require "open-uri"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
require File.join(File.dirname(__FILE__), 'base_fetcher')
|
2
5
|
|
3
6
|
module Apollo
|
4
7
|
module Fetcher
|
5
|
-
class SimpleFetcher <
|
6
|
-
|
7
|
-
|
8
|
-
|
8
|
+
class SimpleFetcher < BaseFetcher
|
9
|
+
def self.fetch(url)
|
10
|
+
# TODO: Throw exception ???
|
11
|
+
return open(url).read
|
12
|
+
end
|
13
|
+
end # class SimpleFetcher
|
14
|
+
end # module Fetcher
|
15
|
+
end # module Apollo
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require "open-uri"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
require File.join(File.dirname(__FILE__), 'base_fetcher')
|
5
|
+
|
6
|
+
module Apollo
|
7
|
+
module Fetcher
|
8
|
+
class SmartFetcher < BaseFetcher
|
9
|
+
def self.fetch(url)
|
10
|
+
# TODO: Throw exception ???
|
11
|
+
return open(url).read
|
12
|
+
end
|
13
|
+
end # class SmartFetcher
|
14
|
+
end # module Fetcher
|
15
|
+
end # module Apollo
|
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'json'
|
2
2
|
|
3
|
-
require File.join(File.dirname(__FILE__), '
|
3
|
+
require File.join(File.dirname(__FILE__), 'base_formatter')
|
4
4
|
|
5
5
|
module Apollo
|
6
6
|
module Formatter
|
7
|
-
class
|
7
|
+
class JsonFormatter < BaseFormatter
|
8
8
|
def format(obj)
|
9
9
|
return Json.format(obj)
|
10
10
|
end
|
@@ -12,6 +12,6 @@ module Apollo
|
|
12
12
|
def self.format(obj)
|
13
13
|
return JSON.pretty_generate(obj)
|
14
14
|
end
|
15
|
-
end
|
16
|
-
end # Formatter
|
17
|
-
end # Apollo
|
15
|
+
end # class JsonFormatter
|
16
|
+
end # module Formatter
|
17
|
+
end # module Apollo
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'awesome_print'
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), 'base_formatter')
|
4
|
+
|
5
|
+
module Apollo
|
6
|
+
module Formatter
|
7
|
+
class PlainFormatter < BaseFormatter
|
8
|
+
def format(obj)
|
9
|
+
return Plain.format(obj)
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.format(obj)
|
13
|
+
return obj.inspect
|
14
|
+
end
|
15
|
+
end # class PlainFormatter
|
16
|
+
end # module Formatter
|
17
|
+
end # module Apollo
|
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'terminal-table'
|
2
2
|
|
3
|
-
require File.join(File.dirname(__FILE__), '
|
3
|
+
require File.join(File.dirname(__FILE__), 'base_formatter')
|
4
4
|
|
5
5
|
module Apollo
|
6
6
|
module Formatter
|
7
|
-
class
|
7
|
+
class TableFormatter < BaseFormatter
|
8
8
|
def format(obj)
|
9
9
|
return Table.format(obj)
|
10
10
|
end
|
@@ -30,6 +30,6 @@ module Apollo
|
|
30
30
|
table = Terminal::Table.new :headings => headings, :rows => rows
|
31
31
|
return table
|
32
32
|
end
|
33
|
-
end
|
34
|
-
end # Formatter
|
35
|
-
end # Apollo
|
33
|
+
end # class TableFormatter
|
34
|
+
end # module Formatter
|
35
|
+
end # module Apollo
|
data/lib/apollo_crawler/lib.rb
CHANGED
@@ -2,27 +2,34 @@
|
|
2
2
|
require File.join(File.dirname(__FILE__), 'program')
|
3
3
|
|
4
4
|
# Caches
|
5
|
-
require File.join(File.dirname(__FILE__), 'cache/
|
5
|
+
require File.join(File.dirname(__FILE__), 'cache/base_cache')
|
6
6
|
require File.join(File.dirname(__FILE__), 'cache/factory')
|
7
|
-
require File.join(File.dirname(__FILE__), 'cache/filesystem_cache')
|
8
7
|
require File.join(File.dirname(__FILE__), 'cache/memcached_cache')
|
9
8
|
require File.join(File.dirname(__FILE__), 'cache/memory_cache')
|
10
9
|
require File.join(File.dirname(__FILE__), 'cache/null_cache')
|
11
10
|
|
12
11
|
# Crawlers
|
13
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
14
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
15
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
16
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
17
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
18
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
12
|
+
require File.join(File.dirname(__FILE__), 'crawler/base_crawler')
|
13
|
+
require File.join(File.dirname(__FILE__), 'crawler/google_crawler')
|
14
|
+
require File.join(File.dirname(__FILE__), 'crawler/hacker_news_crawler')
|
15
|
+
require File.join(File.dirname(__FILE__), 'crawler/slashdot_crawler')
|
16
|
+
require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_crawler')
|
17
|
+
require File.join(File.dirname(__FILE__), 'crawler/xkcd_crawler')
|
19
18
|
|
20
19
|
# Fetchers
|
21
|
-
require File.join(File.dirname(__FILE__), 'fetcher/
|
20
|
+
require File.join(File.dirname(__FILE__), 'fetcher/base_fetcher')
|
22
21
|
require File.join(File.dirname(__FILE__), 'fetcher/simple_fetcher')
|
22
|
+
require File.join(File.dirname(__FILE__), 'fetcher/smart_fetcher')
|
23
23
|
|
24
24
|
# Formatters
|
25
|
-
require File.join(File.dirname(__FILE__), 'formatter/
|
26
|
-
require File.join(File.dirname(__FILE__), 'formatter/
|
27
|
-
require File.join(File.dirname(__FILE__), 'formatter/
|
28
|
-
require File.join(File.dirname(__FILE__), 'formatter/
|
25
|
+
require File.join(File.dirname(__FILE__), 'formatter/base_formatter')
|
26
|
+
require File.join(File.dirname(__FILE__), 'formatter/json_formatter')
|
27
|
+
require File.join(File.dirname(__FILE__), 'formatter/plain_formatter')
|
28
|
+
require File.join(File.dirname(__FILE__), 'formatter/table_formatter')
|
29
|
+
|
30
|
+
# Loggers
|
31
|
+
require File.join(File.dirname(__FILE__), 'logger/base_logger')
|
32
|
+
require File.join(File.dirname(__FILE__), 'logger/console_logger')
|
33
|
+
|
34
|
+
# Stores
|
35
|
+
require File.join(File.dirname(__FILE__), 'store/base_store')
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_logger')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Logger
|
5
|
+
class ConsoleLogger < BaseLogger
|
6
|
+
def log(msg)
|
7
|
+
puts msg
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.log(msg)
|
11
|
+
return Logger.log(msg)
|
12
|
+
end
|
13
|
+
end # class ConsoleLogger
|
14
|
+
end # module Logger
|
15
|
+
end # module Apollo
|
@@ -125,7 +125,7 @@ module Apollo
|
|
125
125
|
config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
|
126
126
|
if(File.exists?(config))
|
127
127
|
if(@options[:verbose])
|
128
|
-
|
128
|
+
RbConfig::DEFAULT_LOGGER "Loading config '#{config}'"
|
129
129
|
end
|
130
130
|
|
131
131
|
# puts "Let's require '#{@options[:verbose]}'"
|
@@ -192,7 +192,8 @@ module Apollo
|
|
192
192
|
|
193
193
|
tmp.each do |x|
|
194
194
|
klass = Object.const_get('Apollo').const_get('Crawler').const_get(x)
|
195
|
-
|
195
|
+
name = x.to_s.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re,"")
|
196
|
+
@crawlers.merge!({ name => klass})
|
196
197
|
end
|
197
198
|
|
198
199
|
if(@options[:verbose])
|
@@ -225,7 +226,8 @@ module Apollo
|
|
225
226
|
|
226
227
|
tmp.each do |x|
|
227
228
|
klass = Object.const_get('Apollo').const_get('Formatter').const_get(x)
|
228
|
-
|
229
|
+
name = x.to_s.downcase.gsub(Apollo::Formatter::BaseFormatter.name_re,"")
|
230
|
+
@formatters.merge!({ name => klass})
|
229
231
|
end
|
230
232
|
|
231
233
|
if(@options[:verbose])
|
@@ -266,9 +268,10 @@ module Apollo
|
|
266
268
|
end
|
267
269
|
|
268
270
|
template_path = RbConfig::CRAWLER_TEMPLATE_PATH
|
271
|
+
puts template_path
|
269
272
|
if(File.exists?(template_path) == false)
|
270
273
|
puts "Template file '#{template_path}' does not exists!"
|
271
|
-
return
|
274
|
+
return -1
|
272
275
|
end
|
273
276
|
|
274
277
|
if(@options[:verbose])
|
@@ -301,6 +304,8 @@ module Apollo
|
|
301
304
|
end
|
302
305
|
end
|
303
306
|
end
|
307
|
+
|
308
|
+
return 0
|
304
309
|
end
|
305
310
|
|
306
311
|
def self.console_table(headings, rows)
|
@@ -342,8 +347,7 @@ module Apollo
|
|
342
347
|
url = args.length > 0 ? args[0] : nil
|
343
348
|
matcher = args.length > 1 ? args[1] : nil
|
344
349
|
|
345
|
-
self.generate_crawler(name, url, matcher)
|
346
|
-
return 0
|
350
|
+
return self.generate_crawler(name, url, matcher)
|
347
351
|
end
|
348
352
|
|
349
353
|
register_modules()
|
@@ -356,7 +360,8 @@ module Apollo
|
|
356
360
|
|
357
361
|
# Look for specified formatter
|
358
362
|
f = @formatters.select { |k, v|
|
359
|
-
|
363
|
+
name = formatter_name.gsub(Apollo::Formatter::BaseFormatter::name_re, "")
|
364
|
+
k.downcase == name
|
360
365
|
}
|
361
366
|
|
362
367
|
if(f)
|
@@ -388,7 +393,9 @@ module Apollo
|
|
388
393
|
end
|
389
394
|
|
390
395
|
crawlers.each do |crawler|
|
391
|
-
|
396
|
+
crawler_name = crawler.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
|
397
|
+
|
398
|
+
p = @crawlers[crawler_name]
|
392
399
|
if(p == nil)
|
393
400
|
puts "Invalid crawler name - '#{crawler}'"
|
394
401
|
puts "See program help"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
@@ -227,30 +227,31 @@ executables:
|
|
227
227
|
extensions: []
|
228
228
|
extra_rdoc_files: []
|
229
229
|
files:
|
230
|
+
- ./lib/apollo_crawler/fetcher/smart_fetcher.rb
|
230
231
|
- ./lib/apollo_crawler/fetcher/simple_fetcher.rb
|
231
|
-
- ./lib/apollo_crawler/fetcher/
|
232
|
+
- ./lib/apollo_crawler/fetcher/base_fetcher.rb
|
232
233
|
- ./lib/apollo_crawler/lib.rb
|
233
234
|
- ./lib/apollo_crawler/version.rb
|
235
|
+
- ./lib/apollo_crawler/logger/console_logger.rb
|
236
|
+
- ./lib/apollo_crawler/logger/base_logger.rb
|
234
237
|
- ./lib/apollo_crawler/program.rb
|
235
238
|
- ./lib/apollo_crawler/config.rb
|
236
239
|
- ./lib/apollo_crawler/cache/factory.rb
|
237
240
|
- ./lib/apollo_crawler/cache/null_cache.rb
|
238
|
-
- ./lib/apollo_crawler/cache/cache_base.rb
|
239
241
|
- ./lib/apollo_crawler/cache/memory_cache.rb
|
240
|
-
- ./lib/apollo_crawler/cache/
|
242
|
+
- ./lib/apollo_crawler/cache/base_cache.rb
|
241
243
|
- ./lib/apollo_crawler/cache/memcached_cache.rb
|
242
|
-
- ./lib/apollo_crawler/crawler/
|
243
|
-
- ./lib/apollo_crawler/crawler/
|
244
|
-
- ./lib/apollo_crawler/crawler/
|
245
|
-
- ./lib/apollo_crawler/crawler/
|
246
|
-
- ./lib/apollo_crawler/crawler/
|
247
|
-
- ./lib/apollo_crawler/crawler/
|
248
|
-
- ./lib/apollo_crawler/
|
249
|
-
- ./lib/apollo_crawler/formatter/
|
250
|
-
- ./lib/apollo_crawler/formatter/
|
251
|
-
- ./lib/apollo_crawler/formatter/
|
252
|
-
- ./lib/apollo_crawler/
|
253
|
-
- ./lib/apollo_crawler/store/store_base.rb
|
244
|
+
- ./lib/apollo_crawler/crawler/xkcd_crawler.rb
|
245
|
+
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
246
|
+
- ./lib/apollo_crawler/crawler/slashdot_crawler.rb
|
247
|
+
- ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
|
248
|
+
- ./lib/apollo_crawler/crawler/base_crawler.rb
|
249
|
+
- ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
|
250
|
+
- ./lib/apollo_crawler/formatter/table_formatter.rb
|
251
|
+
- ./lib/apollo_crawler/formatter/base_formatter.rb
|
252
|
+
- ./lib/apollo_crawler/formatter/json_formatter.rb
|
253
|
+
- ./lib/apollo_crawler/formatter/plain_formatter.rb
|
254
|
+
- ./lib/apollo_crawler/store/base_store.rb
|
254
255
|
- ./lib/apollo_crawler.rb
|
255
256
|
- bin/apollo-crawler
|
256
257
|
homepage: https://github.com/korczis/apollo-crawler
|
@@ -1,37 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'cache_base')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Cache
|
5
|
-
class Filesystem < CacheBase
|
6
|
-
def initialize
|
7
|
-
# puts "This if Filesystem cache"
|
8
|
-
end
|
9
|
-
|
10
|
-
# Get value associated with key from cache
|
11
|
-
def get(key, *args)
|
12
|
-
# Not found, Create, cache and return
|
13
|
-
if block_given?
|
14
|
-
res = yield args
|
15
|
-
end
|
16
|
-
|
17
|
-
return res
|
18
|
-
end
|
19
|
-
|
20
|
-
# Set value associated with key
|
21
|
-
# Return cached value
|
22
|
-
def set(key, value)
|
23
|
-
return value
|
24
|
-
end
|
25
|
-
|
26
|
-
# Check if cache contains specified key
|
27
|
-
def contains(key)
|
28
|
-
return false
|
29
|
-
end
|
30
|
-
|
31
|
-
# Invalidate key/value pair
|
32
|
-
def invalidate(key)
|
33
|
-
return true
|
34
|
-
end
|
35
|
-
end # Filesystem
|
36
|
-
end # Cache
|
37
|
-
end # Apollo
|
@@ -1,24 +0,0 @@
|
|
1
|
-
module Apollo
|
2
|
-
module Crawler
|
3
|
-
class CRAWLER_CLASS_NAME < Crawler
|
4
|
-
@@MATCHER_ITEM = "CRAWLER_MATCHER"
|
5
|
-
|
6
|
-
def name()
|
7
|
-
return "CRAWLER_NAME"
|
8
|
-
end
|
9
|
-
|
10
|
-
def url()
|
11
|
-
return "CRAWLER_URL"
|
12
|
-
end
|
13
|
-
|
14
|
-
def extract_data(doc)
|
15
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
16
|
-
{
|
17
|
-
:text => i.text,
|
18
|
-
:link => URI.join(self.url, i['href'])
|
19
|
-
}
|
20
|
-
}
|
21
|
-
end
|
22
|
-
end # CRAWLER_CLASS_NAME
|
23
|
-
end # Crawler
|
24
|
-
end # Apollo
|
@@ -1,17 +0,0 @@
|
|
1
|
-
require 'awesome_print'
|
2
|
-
|
3
|
-
require File.join(File.dirname(__FILE__), 'formatter_base')
|
4
|
-
|
5
|
-
module Apollo
|
6
|
-
module Formatter
|
7
|
-
class Plain < FormatterBase
|
8
|
-
def format(obj)
|
9
|
-
return Plain.format(obj)
|
10
|
-
end
|
11
|
-
|
12
|
-
def self.format(obj)
|
13
|
-
return obj.inspect
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end # Formatter
|
17
|
-
end # Apollo
|