apollo-crawler 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/apollo_crawler.rb +20 -13
- data/lib/apollo_crawler/cache/{cache_base.rb → base_cache.rb} +4 -4
- data/lib/apollo_crawler/cache/factory.rb +1 -1
- data/lib/apollo_crawler/cache/memcached_cache.rb +5 -5
- data/lib/apollo_crawler/cache/memory_cache.rb +5 -5
- data/lib/apollo_crawler/cache/null_cache.rb +3 -3
- data/lib/apollo_crawler/config.rb +25 -10
- data/lib/apollo_crawler/crawler/{crawler_base.rb → base_crawler.rb} +41 -22
- data/lib/apollo_crawler/crawler/{google_com/google.rb → google_crawler.rb} +7 -7
- data/lib/apollo_crawler/crawler/{ycombinator_com/hacker_news.rb → hacker_news_crawler.rb} +7 -7
- data/lib/apollo_crawler/crawler/{slashdot_org/slashdot.rb → slashdot_crawler.rb} +7 -7
- data/lib/apollo_crawler/crawler/{stackoverflow_com/stackoverflow.rb → stackoverflow_crawler.rb} +7 -7
- data/lib/apollo_crawler/crawler/{xkcd_com/xkcd.rb → xkcd_crawler.rb} +5 -5
- data/lib/apollo_crawler/fetcher/base_fetcher.rb +11 -0
- data/lib/apollo_crawler/fetcher/simple_fetcher.rb +12 -5
- data/lib/apollo_crawler/fetcher/smart_fetcher.rb +15 -0
- data/lib/apollo_crawler/formatter/base_formatter.rb +9 -0
- data/lib/apollo_crawler/formatter/{formatter_json.rb → json_formatter.rb} +5 -5
- data/lib/apollo_crawler/formatter/plain_formatter.rb +17 -0
- data/lib/apollo_crawler/formatter/{formatter_table.rb → table_formatter.rb} +5 -5
- data/lib/apollo_crawler/lib.rb +20 -13
- data/lib/apollo_crawler/logger/base_logger.rb +6 -0
- data/lib/apollo_crawler/logger/console_logger.rb +15 -0
- data/lib/apollo_crawler/program.rb +15 -8
- data/lib/apollo_crawler/store/base_store.rb +6 -0
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +17 -16
- data/lib/apollo_crawler/cache/filesystem_cache.rb +0 -37
- data/lib/apollo_crawler/crawler/crawler_template.rb +0 -24
- data/lib/apollo_crawler/fetcher/fetcher_base.rb +0 -6
- data/lib/apollo_crawler/formatter/formatter_base.rb +0 -6
- data/lib/apollo_crawler/formatter/formatter_plain.rb +0 -17
- data/lib/apollo_crawler/store/store_base.rb +0 -6
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
Y2I3MDM1OWQ1NmU2ZDMzMTg0OGRhNTYzODc5Mzg4MDhhZTkxOWJlMQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ZWY0N2M3ZjU5ZGNmZjgwMTdkNWI0Y2JhYmZjZmUwNDFjYTA5ZjAwOA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MjM1ZWUzMGUyYzBiYjdmNmFjMmE4OTU5MjA0NDE3YWJlZTE4OWQ0YTE5MWQx
|
10
|
+
ZDBmN2UzZjk3N2Y3NDYwOGNiMWFiY2JkY2I2ODJmYzFkYWU3YTYzYjI5YTI0
|
11
|
+
NDQxZDk4YzBlODBmNDg3MzRkMDU2OWY1ZmViNmUzYWFhZjZlNGU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MDIzY2RhMmFkNmI4MjBlYzkzNmZhOTA5NjM2NWUyM2YzODQwM2M0NGIzN2U1
|
14
|
+
OTgxNGYwYTdhMTIxZGVkYzNlYjE1M2U1N2NjMDdiN2I4MWNkMTMwMzRmMDMy
|
15
|
+
MWFiOWRhNGIyOTZhY2NmZDE2YTRjMzUxYjQyODU1NTAzNjQwM2M=
|
data/lib/apollo_crawler.rb
CHANGED
@@ -5,27 +5,34 @@
|
|
5
5
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/program')
|
6
6
|
|
7
7
|
# Caches
|
8
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/cache_base')
|
8
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/base_cache')
|
9
9
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/factory')
|
10
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/filesystem_cache')
|
11
10
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memcached_cache')
|
12
11
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memory_cache')
|
13
12
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/null_cache')
|
14
13
|
|
15
14
|
# Crawlers
|
16
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
17
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
18
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
19
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
20
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
21
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/
|
15
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/base_crawler')
|
16
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/google_crawler')
|
17
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/hacker_news_crawler')
|
18
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_crawler')
|
19
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_crawler')
|
20
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_crawler')
|
22
21
|
|
23
22
|
# Fetchers
|
24
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/fetcher_base')
|
23
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/base_fetcher')
|
25
24
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/simple_fetcher')
|
25
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/smart_fetcher')
|
26
26
|
|
27
27
|
# Formatters
|
28
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/
|
29
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/
|
30
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/
|
31
|
-
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/
|
28
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/base_formatter')
|
29
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/json_formatter')
|
30
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/plain_formatter')
|
31
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/table_formatter')
|
32
|
+
|
33
|
+
# Loggers
|
34
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/base_logger')
|
35
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/console_logger')
|
36
|
+
|
37
|
+
# Stores
|
38
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/store/base_store')
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Apollo
|
2
2
|
module Cache
|
3
|
-
class CacheBase
|
3
|
+
class BaseCache
|
4
4
|
# Get value associated with key from cache
|
5
5
|
def get(key, *args)
|
6
6
|
|
@@ -32,6 +32,6 @@ module Apollo
|
|
32
32
|
def clear
|
33
33
|
return
|
34
34
|
end
|
35
|
-
end #
|
36
|
-
end # Cache
|
37
|
-
end # Apollo
|
35
|
+
end # class BaseCache
|
36
|
+
end # module Cache
|
37
|
+
end # module Apollo
|
@@ -1,10 +1,10 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'cache_base')
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_cache')
|
2
2
|
|
3
3
|
require 'dalli'
|
4
4
|
|
5
5
|
module Apollo
|
6
6
|
module Cache
|
7
|
-
class
|
7
|
+
class MemcachedCache < BaseCache
|
8
8
|
@cache = nil
|
9
9
|
|
10
10
|
def initialize
|
@@ -46,6 +46,6 @@ module Apollo
|
|
46
46
|
def clear
|
47
47
|
# TODO: Implement
|
48
48
|
end
|
49
|
-
end #
|
50
|
-
end # Cache
|
51
|
-
end # Apollo
|
49
|
+
end # class MemcachedCache
|
50
|
+
end # module Cache
|
51
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_cache')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Cache
|
5
|
-
class
|
5
|
+
class MemoryCache < BaseCache
|
6
6
|
@cache = nil
|
7
7
|
|
8
8
|
def initialize
|
@@ -41,6 +41,6 @@ module Apollo
|
|
41
41
|
def clear
|
42
42
|
@cache.clear
|
43
43
|
end
|
44
|
-
end #
|
45
|
-
end # Cache
|
46
|
-
end # Apollo
|
44
|
+
end # class MemoryCache
|
45
|
+
end # module Cache
|
46
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_cache')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Cache
|
5
|
-
class
|
5
|
+
class NullCache < BaseCache
|
6
6
|
# Get value associated with key from cache
|
7
7
|
def get(key, *args)
|
8
8
|
# Not found, Create, cache and return
|
@@ -28,6 +28,6 @@ module Apollo
|
|
28
28
|
def invalidate(key)
|
29
29
|
return true
|
30
30
|
end
|
31
|
-
end #
|
31
|
+
end # NullCache
|
32
32
|
end # Cache
|
33
33
|
end # Apollo
|
@@ -14,40 +14,55 @@ module RbConfig
|
|
14
14
|
############################################################
|
15
15
|
#
|
16
16
|
# Filesystem backend
|
17
|
-
# CACHE_CLASS = Apollo::Cache::
|
17
|
+
# CACHE_CLASS = Apollo::Cache::FilesystemCache
|
18
18
|
#
|
19
19
|
# Memcached - expects localhost:11211
|
20
|
-
# CACHE_CLASS = Apollo::Cache::
|
20
|
+
# CACHE_CLASS = Apollo::Cache::MemcachedCache
|
21
21
|
#
|
22
22
|
# Pure naive ruby in-memory implementation
|
23
|
-
# CACHE_CLASS = Apollo::Cache::
|
23
|
+
# CACHE_CLASS = Apollo::Cache::MemoryCache
|
24
24
|
#
|
25
25
|
# Null caching - no caching at all
|
26
|
-
# CACHE_CLASS = Apollo::Cache::
|
26
|
+
# CACHE_CLASS = Apollo::Cache::NullCache
|
27
27
|
|
28
28
|
# Used caching mechanism by default
|
29
|
-
CACHE_CLASS = Apollo::Cache::
|
29
|
+
CACHE_CLASS = Apollo::Cache::MemcachedCache
|
30
30
|
|
31
31
|
|
32
32
|
|
33
33
|
############################################################
|
34
34
|
# Crawlers - Built-in, out-of-the-box working crawlers
|
35
35
|
############################################################
|
36
|
-
CRAWLERS_DIR = File.join(File.dirname(__FILE__), "
|
36
|
+
CRAWLERS_DIR = File.join(File.dirname(__FILE__), "crawler")
|
37
37
|
|
38
38
|
# Template used for generated crawlers
|
39
|
-
CRAWLER_TEMPLATE_NAME = "crawler_template.
|
39
|
+
CRAWLER_TEMPLATE_NAME = "crawler_template.trb"
|
40
40
|
|
41
41
|
# Path of template
|
42
|
-
CRAWLER_TEMPLATE_PATH = File.join(
|
42
|
+
CRAWLER_TEMPLATE_PATH = File.join(CRAWLERS_DIR, CRAWLER_TEMPLATE_NAME)
|
43
43
|
|
44
44
|
|
45
45
|
|
46
|
+
############################################################
|
47
|
+
# Fetchers - used for fetching documents
|
48
|
+
############################################################
|
49
|
+
FETCHERS_DIR = File.join(File.dirname(__FILE__), "fetcher")
|
50
|
+
|
51
|
+
DEFAULT_FETCHER = Apollo::Fetcher::SmartFetcher
|
52
|
+
|
53
|
+
|
46
54
|
############################################################
|
47
55
|
# Formatters - used for formatting crawled documents results
|
48
56
|
############################################################
|
49
|
-
FORMATTERS_DIR = File.join(File.dirname(__FILE__), "
|
57
|
+
FORMATTERS_DIR = File.join(File.dirname(__FILE__), "formatter")
|
50
58
|
|
51
59
|
# Default formatter if no other specified
|
52
|
-
DEFAULT_FORMATTER = Apollo::Formatter::
|
60
|
+
DEFAULT_FORMATTER = Apollo::Formatter::JsonFormatter
|
61
|
+
|
62
|
+
|
63
|
+
|
64
|
+
############################################################
|
65
|
+
# Loggers - used for formatting output messages
|
66
|
+
############################################################
|
67
|
+
DEFAULT_LOGGER = Apollo::Logger::ConsoleLogger
|
53
68
|
end # Config
|
@@ -3,13 +3,19 @@ require "nokogiri"
|
|
3
3
|
|
4
4
|
module Apollo
|
5
5
|
module Crawler
|
6
|
-
class
|
6
|
+
class BaseCrawler
|
7
|
+
|
8
|
+
|
7
9
|
@backlog = nil
|
8
10
|
|
9
11
|
def initialize
|
10
12
|
@backlog = []
|
11
13
|
end
|
12
14
|
|
15
|
+
def self.name_re()
|
16
|
+
return /crawler$/
|
17
|
+
end
|
18
|
+
|
13
19
|
# Name of the crawler
|
14
20
|
def name
|
15
21
|
return "Crawler Base"
|
@@ -19,6 +25,10 @@ module Apollo
|
|
19
25
|
return nil
|
20
26
|
end
|
21
27
|
|
28
|
+
def self.fetch(url)
|
29
|
+
RbConfig::DEFAULT_FETCHER.fetch(url)
|
30
|
+
end
|
31
|
+
|
22
32
|
def self.try_get_url(root, url)
|
23
33
|
begin
|
24
34
|
return URI.join(root, url)
|
@@ -28,7 +38,7 @@ module Apollo
|
|
28
38
|
end
|
29
39
|
|
30
40
|
def self.try_get_doc(root, url)
|
31
|
-
doc =
|
41
|
+
doc = BaseCrawler.try_get_url(root, url)
|
32
42
|
|
33
43
|
# TODO: Set expiration header
|
34
44
|
return {
|
@@ -47,16 +57,19 @@ module Apollo
|
|
47
57
|
url = self.url
|
48
58
|
end
|
49
59
|
|
60
|
+
# TODO: Be more aggressive, use assert, it is the client's responsibility!
|
50
61
|
if(url.nil?)
|
51
62
|
return nil
|
52
63
|
end
|
53
64
|
|
65
|
+
# We support both - list of urls or single url
|
54
66
|
if(url.kind_of?(Array))
|
55
67
|
@backlog.concat(url)
|
56
68
|
else
|
57
69
|
@backlog << url
|
58
70
|
end
|
59
71
|
|
72
|
+
# Counter of processed documents (pages)
|
60
73
|
docs_processed = 0
|
61
74
|
|
62
75
|
res = []
|
@@ -66,34 +79,40 @@ module Apollo
|
|
66
79
|
|
67
80
|
# puts "Processing '#{url}'"
|
68
81
|
doc = self.process_url(url)
|
69
|
-
res << doc
|
70
82
|
|
71
|
-
#
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
83
|
+
# Increase counter of processed documents
|
84
|
+
docs_processed = docs_processed + 1
|
85
|
+
|
86
|
+
# Process document if it was successfully retrieved
|
87
|
+
if(!doc.nil?)
|
88
|
+
# TODO: Use log4r and log it only on info level
|
89
|
+
if block_given?
|
90
|
+
yield doc
|
91
|
+
end
|
76
92
|
|
77
|
-
|
78
|
-
|
79
|
-
url = link[:link].to_s
|
80
|
-
# TODO: Use log4r and log it only on info level
|
81
|
-
#puts url
|
93
|
+
# Add document to queue of results
|
94
|
+
res << doc
|
82
95
|
|
83
|
-
|
84
|
-
|
96
|
+
# If the document contains links, add them to the backlog
|
97
|
+
if(doc[:links].nil? == false)
|
98
|
+
doc[:links].each do |link|
|
99
|
+
url = link[:link].to_s
|
100
|
+
# TODO: Use log4r and log it only on info level
|
101
|
+
#puts url
|
102
|
+
|
103
|
+
# TODO: Check if it is unique
|
104
|
+
@backlog << url
|
105
|
+
end
|
85
106
|
end
|
86
107
|
end
|
87
108
|
|
88
|
-
#
|
89
|
-
docs_processed = docs_processed + 1
|
109
|
+
# Break if the limit of documents to process was reached
|
90
110
|
break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
|
91
111
|
end
|
92
112
|
return res
|
93
113
|
end
|
94
114
|
|
95
115
|
def process_url(url)
|
96
|
-
# Try fetch document
|
97
116
|
doc = self.fetch_document(url)
|
98
117
|
if(doc.nil?)
|
99
118
|
return nil
|
@@ -140,7 +159,7 @@ module Apollo
|
|
140
159
|
res = nil
|
141
160
|
while(attempt_no < max_attempts && success == false) do
|
142
161
|
begin
|
143
|
-
res =
|
162
|
+
res = BaseCrawler.fetch(url)
|
144
163
|
success = true
|
145
164
|
rescue Exception => e
|
146
165
|
puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
|
@@ -170,6 +189,6 @@ module Apollo
|
|
170
189
|
res = []
|
171
190
|
return res
|
172
191
|
end
|
173
|
-
end #
|
174
|
-
end # Crawler
|
175
|
-
end # Apollo
|
192
|
+
end # class BaseCrawler
|
193
|
+
end # module Crawler
|
194
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Crawler
|
5
|
-
class
|
5
|
+
class GoogleCrawler < BaseCrawler
|
6
6
|
@@MATCHER_ITEM = "//h3/a"
|
7
7
|
|
8
8
|
def name()
|
@@ -15,7 +15,7 @@ module Apollo
|
|
15
15
|
|
16
16
|
def extract_data(doc)
|
17
17
|
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
18
|
-
url =
|
18
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
19
19
|
next if url.nil?
|
20
20
|
|
21
21
|
{
|
@@ -27,7 +27,7 @@ module Apollo
|
|
27
27
|
|
28
28
|
def extract_links(doc)
|
29
29
|
res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
|
30
|
-
res_doc =
|
30
|
+
res_doc = BaseCrawler.try_get_url(self.url, node['href'])
|
31
31
|
next if url.nil?
|
32
32
|
|
33
33
|
{
|
@@ -35,6 +35,6 @@ module Apollo
|
|
35
35
|
}
|
36
36
|
}
|
37
37
|
end
|
38
|
-
end
|
39
|
-
end # Crawler
|
40
|
-
end # Apollo
|
38
|
+
end # class GoogleCrawler
|
39
|
+
end # module Crawler
|
40
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Crawler
|
5
|
-
class
|
5
|
+
class HackerNewsCrawler < BaseCrawler
|
6
6
|
@@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
|
7
7
|
|
8
8
|
def name()
|
@@ -15,7 +15,7 @@ module Apollo
|
|
15
15
|
|
16
16
|
def extract_data(doc)
|
17
17
|
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
-
url =
|
18
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
19
19
|
next if url.nil?
|
20
20
|
|
21
21
|
{
|
@@ -29,7 +29,7 @@ module Apollo
|
|
29
29
|
|
30
30
|
def extract_links(doc)
|
31
31
|
res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
|
32
|
-
url =
|
32
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
33
33
|
next if url.nil?
|
34
34
|
|
35
35
|
{
|
@@ -39,6 +39,6 @@ module Apollo
|
|
39
39
|
|
40
40
|
return res.uniq
|
41
41
|
end
|
42
|
-
end
|
43
|
-
end # Crawler
|
44
|
-
end # Apollo
|
42
|
+
end # class HackerNewsCrawler
|
43
|
+
end # module Crawler
|
44
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Crawler
|
5
|
-
class
|
5
|
+
class SlashdotCrawler < BaseCrawler
|
6
6
|
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
7
7
|
|
8
8
|
def name()
|
@@ -15,7 +15,7 @@ module Apollo
|
|
15
15
|
|
16
16
|
def extract_data(doc)
|
17
17
|
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
18
|
-
url =
|
18
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
19
19
|
next if url.nil?
|
20
20
|
|
21
21
|
{
|
@@ -27,7 +27,7 @@ module Apollo
|
|
27
27
|
|
28
28
|
def extract_links(doc)
|
29
29
|
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
30
|
-
url =
|
30
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
31
31
|
next if url.nil?
|
32
32
|
|
33
33
|
{
|
@@ -35,6 +35,6 @@ module Apollo
|
|
35
35
|
}
|
36
36
|
}
|
37
37
|
end
|
38
|
-
end
|
39
|
-
end # Crawler
|
40
|
-
end # Apollo
|
38
|
+
end # class SlashdotCrawler
|
39
|
+
end # module Crawler
|
40
|
+
end # module Apollo
|
data/lib/apollo_crawler/crawler/{stackoverflow_com/stackoverflow.rb → stackoverflow_crawler.rb}
RENAMED
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Crawler
|
5
|
-
class
|
5
|
+
class StackoverflowCrawler < BaseCrawler
|
6
6
|
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
7
7
|
|
8
8
|
def name()
|
@@ -15,7 +15,7 @@ module Apollo
|
|
15
15
|
|
16
16
|
def extract_data(doc)
|
17
17
|
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
-
url =
|
18
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
19
19
|
next if url.nil?
|
20
20
|
|
21
21
|
{
|
@@ -29,7 +29,7 @@ module Apollo
|
|
29
29
|
|
30
30
|
def extract_links(doc)
|
31
31
|
res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
|
32
|
-
url =
|
32
|
+
url = BaseCrawler.try_get_url(self.url, node['href'])
|
33
33
|
next if url.nil?
|
34
34
|
|
35
35
|
{
|
@@ -39,6 +39,6 @@ module Apollo
|
|
39
39
|
|
40
40
|
return res.uniq
|
41
41
|
end
|
42
|
-
end
|
43
|
-
end # Crawler
|
44
|
-
end # Apollo
|
42
|
+
end # class StackoverflowCrawler
|
43
|
+
end # module Crawler
|
44
|
+
end # module Apollo
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
2
2
|
|
3
3
|
module Apollo
|
4
4
|
module Crawler
|
5
|
-
class
|
5
|
+
class XkcdCrawler < BaseCrawler
|
6
6
|
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
7
7
|
|
8
8
|
def name()
|
@@ -30,6 +30,6 @@ module Apollo
|
|
30
30
|
}
|
31
31
|
res.uniq
|
32
32
|
end
|
33
|
-
end
|
34
|
-
end # Crawler
|
35
|
-
end # Apollo
|
33
|
+
end # class XkcdCrawler
|
34
|
+
end # module Crawler
|
35
|
+
end # module Apollo
|
@@ -1,8 +1,15 @@
|
|
1
|
-
require
|
1
|
+
require "open-uri"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
require File.join(File.dirname(__FILE__), 'base_fetcher')
|
2
5
|
|
3
6
|
module Apollo
|
4
7
|
module Fetcher
|
5
|
-
class SimpleFetcher <
|
6
|
-
|
7
|
-
|
8
|
-
|
8
|
+
class SimpleFetcher < BaseFetcher
|
9
|
+
def self.fetch(url)
|
10
|
+
# TODO: Throw exception ???
|
11
|
+
return open(url).read
|
12
|
+
end
|
13
|
+
end # class SimpleFetcher
|
14
|
+
end # module Fetcher
|
15
|
+
end # module Apollo
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require "open-uri"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
require File.join(File.dirname(__FILE__), 'base_fetcher')
|
5
|
+
|
6
|
+
module Apollo
|
7
|
+
module Fetcher
|
8
|
+
class SmartFetcher < BaseFetcher
|
9
|
+
def self.fetch(url)
|
10
|
+
# TODO: Throw exception ???
|
11
|
+
return open(url).read
|
12
|
+
end
|
13
|
+
end # class SmartFetcher
|
14
|
+
end # module Fetcher
|
15
|
+
end # module Apollo
|
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'json'
|
2
2
|
|
3
|
-
require File.join(File.dirname(__FILE__), '
|
3
|
+
require File.join(File.dirname(__FILE__), 'base_formatter')
|
4
4
|
|
5
5
|
module Apollo
|
6
6
|
module Formatter
|
7
|
-
class
|
7
|
+
class JsonFormatter < BaseFormatter
|
8
8
|
def format(obj)
|
9
9
|
return Json.format(obj)
|
10
10
|
end
|
@@ -12,6 +12,6 @@ module Apollo
|
|
12
12
|
def self.format(obj)
|
13
13
|
return JSON.pretty_generate(obj)
|
14
14
|
end
|
15
|
-
end
|
16
|
-
end # Formatter
|
17
|
-
end # Apollo
|
15
|
+
end # class JsonFormatter
|
16
|
+
end # module Formatter
|
17
|
+
end # module Apollo
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'awesome_print'
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), 'base_formatter')
|
4
|
+
|
5
|
+
module Apollo
|
6
|
+
module Formatter
|
7
|
+
class PlainFormatter < BaseFormatter
|
8
|
+
def format(obj)
|
9
|
+
return Plain.format(obj)
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.format(obj)
|
13
|
+
return obj.inspect
|
14
|
+
end
|
15
|
+
end # class PlainFormatter
|
16
|
+
end # module Formatter
|
17
|
+
end # module Apollo
|
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'terminal-table'
|
2
2
|
|
3
|
-
require File.join(File.dirname(__FILE__), '
|
3
|
+
require File.join(File.dirname(__FILE__), 'base_formatter')
|
4
4
|
|
5
5
|
module Apollo
|
6
6
|
module Formatter
|
7
|
-
class
|
7
|
+
class TableFormatter < BaseFormatter
|
8
8
|
def format(obj)
|
9
9
|
return Table.format(obj)
|
10
10
|
end
|
@@ -30,6 +30,6 @@ module Apollo
|
|
30
30
|
table = Terminal::Table.new :headings => headings, :rows => rows
|
31
31
|
return table
|
32
32
|
end
|
33
|
-
end
|
34
|
-
end # Formatter
|
35
|
-
end # Apollo
|
33
|
+
end # class TableFormatter
|
34
|
+
end # module Formatter
|
35
|
+
end # module Apollo
|
data/lib/apollo_crawler/lib.rb
CHANGED
@@ -2,27 +2,34 @@
|
|
2
2
|
require File.join(File.dirname(__FILE__), 'program')
|
3
3
|
|
4
4
|
# Caches
|
5
|
-
require File.join(File.dirname(__FILE__), 'cache/cache_base')
|
5
|
+
require File.join(File.dirname(__FILE__), 'cache/base_cache')
|
6
6
|
require File.join(File.dirname(__FILE__), 'cache/factory')
|
7
|
-
require File.join(File.dirname(__FILE__), 'cache/filesystem_cache')
|
8
7
|
require File.join(File.dirname(__FILE__), 'cache/memcached_cache')
|
9
8
|
require File.join(File.dirname(__FILE__), 'cache/memory_cache')
|
10
9
|
require File.join(File.dirname(__FILE__), 'cache/null_cache')
|
11
10
|
|
12
11
|
# Crawlers
|
13
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
14
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
15
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
16
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
17
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
18
|
-
require File.join(File.dirname(__FILE__), 'crawler/
|
12
|
+
require File.join(File.dirname(__FILE__), 'crawler/base_crawler')
|
13
|
+
require File.join(File.dirname(__FILE__), 'crawler/google_crawler')
|
14
|
+
require File.join(File.dirname(__FILE__), 'crawler/hacker_news_crawler')
|
15
|
+
require File.join(File.dirname(__FILE__), 'crawler/slashdot_crawler')
|
16
|
+
require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_crawler')
|
17
|
+
require File.join(File.dirname(__FILE__), 'crawler/xkcd_crawler')
|
19
18
|
|
20
19
|
# Fetchers
|
21
|
-
require File.join(File.dirname(__FILE__), 'fetcher/
|
20
|
+
require File.join(File.dirname(__FILE__), 'fetcher/base_fetcher')
|
22
21
|
require File.join(File.dirname(__FILE__), 'fetcher/simple_fetcher')
|
22
|
+
require File.join(File.dirname(__FILE__), 'fetcher/smart_fetcher')
|
23
23
|
|
24
24
|
# Formatters
|
25
|
-
require File.join(File.dirname(__FILE__), 'formatter/
|
26
|
-
require File.join(File.dirname(__FILE__), 'formatter/
|
27
|
-
require File.join(File.dirname(__FILE__), 'formatter/
|
28
|
-
require File.join(File.dirname(__FILE__), 'formatter/
|
25
|
+
require File.join(File.dirname(__FILE__), 'formatter/base_formatter')
|
26
|
+
require File.join(File.dirname(__FILE__), 'formatter/json_formatter')
|
27
|
+
require File.join(File.dirname(__FILE__), 'formatter/plain_formatter')
|
28
|
+
require File.join(File.dirname(__FILE__), 'formatter/table_formatter')
|
29
|
+
|
30
|
+
# Loggers
|
31
|
+
require File.join(File.dirname(__FILE__), 'logger/base_logger')
|
32
|
+
require File.join(File.dirname(__FILE__), 'logger/console_logger')
|
33
|
+
|
34
|
+
# Stores
|
35
|
+
require File.join(File.dirname(__FILE__), 'store/base_store')
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'base_logger')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Logger
|
5
|
+
class ConsoleLogger < BaseLogger
|
6
|
+
def log(msg)
|
7
|
+
puts msg
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.log(msg)
|
11
|
+
return Logger.log(msg)
|
12
|
+
end
|
13
|
+
end # class ConsoleLogger
|
14
|
+
end # module Logger
|
15
|
+
end # module Apollo
|
@@ -125,7 +125,7 @@ module Apollo
|
|
125
125
|
config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
|
126
126
|
if(File.exists?(config))
|
127
127
|
if(@options[:verbose])
|
128
|
-
|
128
|
+
RbConfig::DEFAULT_LOGGER "Loading config '#{config}'"
|
129
129
|
end
|
130
130
|
|
131
131
|
# puts "Let's require '#{@options[:verbose]}'"
|
@@ -192,7 +192,8 @@ module Apollo
|
|
192
192
|
|
193
193
|
tmp.each do |x|
|
194
194
|
klass = Object.const_get('Apollo').const_get('Crawler').const_get(x)
|
195
|
-
|
195
|
+
name = x.to_s.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re,"")
|
196
|
+
@crawlers.merge!({ name => klass})
|
196
197
|
end
|
197
198
|
|
198
199
|
if(@options[:verbose])
|
@@ -225,7 +226,8 @@ module Apollo
|
|
225
226
|
|
226
227
|
tmp.each do |x|
|
227
228
|
klass = Object.const_get('Apollo').const_get('Formatter').const_get(x)
|
228
|
-
|
229
|
+
name = x.to_s.downcase.gsub(Apollo::Formatter::BaseFormatter.name_re,"")
|
230
|
+
@formatters.merge!({ name => klass})
|
229
231
|
end
|
230
232
|
|
231
233
|
if(@options[:verbose])
|
@@ -266,9 +268,10 @@ module Apollo
|
|
266
268
|
end
|
267
269
|
|
268
270
|
template_path = RbConfig::CRAWLER_TEMPLATE_PATH
|
271
|
+
puts template_path
|
269
272
|
if(File.exists?(template_path) == false)
|
270
273
|
puts "Template file '#{template_path}' does not exists!"
|
271
|
-
return
|
274
|
+
return -1
|
272
275
|
end
|
273
276
|
|
274
277
|
if(@options[:verbose])
|
@@ -301,6 +304,8 @@ module Apollo
|
|
301
304
|
end
|
302
305
|
end
|
303
306
|
end
|
307
|
+
|
308
|
+
return 0
|
304
309
|
end
|
305
310
|
|
306
311
|
def self.console_table(headings, rows)
|
@@ -342,8 +347,7 @@ module Apollo
|
|
342
347
|
url = args.length > 0 ? args[0] : nil
|
343
348
|
matcher = args.length > 1 ? args[1] : nil
|
344
349
|
|
345
|
-
self.generate_crawler(name, url, matcher)
|
346
|
-
return 0
|
350
|
+
return self.generate_crawler(name, url, matcher)
|
347
351
|
end
|
348
352
|
|
349
353
|
register_modules()
|
@@ -356,7 +360,8 @@ module Apollo
|
|
356
360
|
|
357
361
|
# Look for specified formatter
|
358
362
|
f = @formatters.select { |k, v|
|
359
|
-
|
363
|
+
name = formatter_name.gsub(Apollo::Formatter::BaseFormatter::name_re, "")
|
364
|
+
k.downcase == name
|
360
365
|
}
|
361
366
|
|
362
367
|
if(f)
|
@@ -388,7 +393,9 @@ module Apollo
|
|
388
393
|
end
|
389
394
|
|
390
395
|
crawlers.each do |crawler|
|
391
|
-
|
396
|
+
crawler_name = crawler.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
|
397
|
+
|
398
|
+
p = @crawlers[crawler_name]
|
392
399
|
if(p == nil)
|
393
400
|
puts "Invalid crawler name - '#{crawler}'"
|
394
401
|
puts "See program help"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.8
|
4
|
+
version: 0.1.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
@@ -227,30 +227,31 @@ executables:
|
|
227
227
|
extensions: []
|
228
228
|
extra_rdoc_files: []
|
229
229
|
files:
|
230
|
+
- ./lib/apollo_crawler/fetcher/smart_fetcher.rb
|
230
231
|
- ./lib/apollo_crawler/fetcher/simple_fetcher.rb
|
231
|
-
- ./lib/apollo_crawler/fetcher/fetcher_base.rb
|
232
|
+
- ./lib/apollo_crawler/fetcher/base_fetcher.rb
|
232
233
|
- ./lib/apollo_crawler/lib.rb
|
233
234
|
- ./lib/apollo_crawler/version.rb
|
235
|
+
- ./lib/apollo_crawler/logger/console_logger.rb
|
236
|
+
- ./lib/apollo_crawler/logger/base_logger.rb
|
234
237
|
- ./lib/apollo_crawler/program.rb
|
235
238
|
- ./lib/apollo_crawler/config.rb
|
236
239
|
- ./lib/apollo_crawler/cache/factory.rb
|
237
240
|
- ./lib/apollo_crawler/cache/null_cache.rb
|
238
|
-
- ./lib/apollo_crawler/cache/cache_base.rb
|
239
241
|
- ./lib/apollo_crawler/cache/memory_cache.rb
|
240
|
-
- ./lib/apollo_crawler/cache/filesystem_cache.rb
|
242
|
+
- ./lib/apollo_crawler/cache/base_cache.rb
|
241
243
|
- ./lib/apollo_crawler/cache/memcached_cache.rb
|
242
|
-
- ./lib/apollo_crawler/crawler/
|
243
|
-
- ./lib/apollo_crawler/crawler/
|
244
|
-
- ./lib/apollo_crawler/crawler/
|
245
|
-
- ./lib/apollo_crawler/crawler/
|
246
|
-
- ./lib/apollo_crawler/crawler/
|
247
|
-
- ./lib/apollo_crawler/crawler/
|
248
|
-
- ./lib/apollo_crawler/
|
249
|
-
- ./lib/apollo_crawler/formatter/
|
250
|
-
- ./lib/apollo_crawler/formatter/
|
251
|
-
- ./lib/apollo_crawler/formatter/
|
252
|
-
- ./lib/apollo_crawler/
|
253
|
-
- ./lib/apollo_crawler/store/store_base.rb
|
244
|
+
- ./lib/apollo_crawler/crawler/xkcd_crawler.rb
|
245
|
+
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
246
|
+
- ./lib/apollo_crawler/crawler/slashdot_crawler.rb
|
247
|
+
- ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
|
248
|
+
- ./lib/apollo_crawler/crawler/base_crawler.rb
|
249
|
+
- ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
|
250
|
+
- ./lib/apollo_crawler/formatter/table_formatter.rb
|
251
|
+
- ./lib/apollo_crawler/formatter/base_formatter.rb
|
252
|
+
- ./lib/apollo_crawler/formatter/json_formatter.rb
|
253
|
+
- ./lib/apollo_crawler/formatter/plain_formatter.rb
|
254
|
+
- ./lib/apollo_crawler/store/base_store.rb
|
254
255
|
- ./lib/apollo_crawler.rb
|
255
256
|
- bin/apollo-crawler
|
256
257
|
homepage: https://github.com/korczis/apollo-crawler
|
@@ -1,37 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'cache_base')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Cache
|
5
|
-
class Filesystem < CacheBase
|
6
|
-
def initialize
|
7
|
-
# puts "This if Filesystem cache"
|
8
|
-
end
|
9
|
-
|
10
|
-
# Get value associated with key from cache
|
11
|
-
def get(key, *args)
|
12
|
-
# Not found, Create, cache and return
|
13
|
-
if block_given?
|
14
|
-
res = yield args
|
15
|
-
end
|
16
|
-
|
17
|
-
return res
|
18
|
-
end
|
19
|
-
|
20
|
-
# Set value associated with key
|
21
|
-
# Return cached value
|
22
|
-
def set(key, value)
|
23
|
-
return value
|
24
|
-
end
|
25
|
-
|
26
|
-
# Check if cache contains specified key
|
27
|
-
def contains(key)
|
28
|
-
return false
|
29
|
-
end
|
30
|
-
|
31
|
-
# Invalidate key/value pair
|
32
|
-
def invalidate(key)
|
33
|
-
return true
|
34
|
-
end
|
35
|
-
end # Filesystem
|
36
|
-
end # Cache
|
37
|
-
end # Apollo
|
@@ -1,24 +0,0 @@
|
|
1
|
-
module Apollo
|
2
|
-
module Crawler
|
3
|
-
class CRAWLER_CLASS_NAME < Crawler
|
4
|
-
@@MATCHER_ITEM = "CRAWLER_MATCHER"
|
5
|
-
|
6
|
-
def name()
|
7
|
-
return "CRAWLER_NAME"
|
8
|
-
end
|
9
|
-
|
10
|
-
def url()
|
11
|
-
return "CRAWLER_URL"
|
12
|
-
end
|
13
|
-
|
14
|
-
def extract_data(doc)
|
15
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
16
|
-
{
|
17
|
-
:text => i.text,
|
18
|
-
:link => URI.join(self.url, i['href'])
|
19
|
-
}
|
20
|
-
}
|
21
|
-
end
|
22
|
-
end # CRAWLER_CLASS_NAME
|
23
|
-
end # Crawler
|
24
|
-
end # Apollo
|
@@ -1,17 +0,0 @@
|
|
1
|
-
require 'awesome_print'
|
2
|
-
|
3
|
-
require File.join(File.dirname(__FILE__), 'formatter_base')
|
4
|
-
|
5
|
-
module Apollo
|
6
|
-
module Formatter
|
7
|
-
class Plain < FormatterBase
|
8
|
-
def format(obj)
|
9
|
-
return Plain.format(obj)
|
10
|
-
end
|
11
|
-
|
12
|
-
def self.format(obj)
|
13
|
-
return obj.inspect
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end # Formatter
|
17
|
-
end # Apollo
|