apollo-crawler 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/bin/apollo-crawler +410 -405
- data/lib/apollo_crawler.rb +20 -20
- data/lib/apollo_crawler/cache.rb +34 -34
- data/lib/apollo_crawler/caches/factory.rb +30 -30
- data/lib/apollo_crawler/caches/filesystem_cache.rb +34 -34
- data/lib/apollo_crawler/caches/memory_cache.rb +43 -43
- data/lib/apollo_crawler/caches/null_cache.rb +30 -30
- data/lib/apollo_crawler/crawler.rb +154 -127
- data/lib/apollo_crawler/crawler_template.rb +24 -24
- data/lib/apollo_crawler/crawlers/google_com/google.rb +40 -26
- data/lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb +40 -26
- data/lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb +44 -26
- data/lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb +35 -35
- data/lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb +44 -26
- data/lib/apollo_crawler/formatter.rb +6 -6
- data/lib/apollo_crawler/formatters/formatter_json.rb +17 -17
- data/lib/apollo_crawler/formatters/formatter_plain.rb +17 -17
- data/lib/apollo_crawler/formatters/formatter_table.rb +35 -33
- data/lib/apollo_crawler/version.rb +2 -2
- metadata +12 -14
- data/lib/apollo_crawler/crawlers/alexa_com/alexa.rb +0 -26
- data/lib/apollo_crawler/crawlers/firmy_cz/firmy.rb +0 -26
data/lib/apollo_crawler.rb
CHANGED
@@ -1,20 +1,20 @@
|
|
1
|
-
# Main
|
2
|
-
require 'apollo_crawler/cache'
|
3
|
-
require 'apollo_crawler/crawler'
|
4
|
-
require 'apollo_crawler/formatter'
|
5
|
-
|
6
|
-
# Caches
|
7
|
-
require 'apollo_crawler/caches/factory'
|
8
|
-
require 'apollo_crawler/caches/memory_cache'
|
9
|
-
require 'apollo_crawler/caches/null_cache'
|
10
|
-
|
11
|
-
# Crawlers
|
12
|
-
require 'apollo_crawler/crawlers/alexa_com/alexa'
|
13
|
-
require 'apollo_crawler/crawlers/firmy_cz/firmy'
|
14
|
-
require 'apollo_crawler/crawlers/slashdot_org/slashdot'
|
15
|
-
require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
|
16
|
-
|
17
|
-
# Formatters
|
18
|
-
require 'apollo_crawler/formatters/formatter_json'
|
19
|
-
require 'apollo_crawler/formatters/formatter_plain'
|
20
|
-
require 'apollo_crawler/formatters/formatter_table'
|
1
|
+
# Main
|
2
|
+
require 'apollo_crawler/cache'
|
3
|
+
require 'apollo_crawler/crawler'
|
4
|
+
require 'apollo_crawler/formatter'
|
5
|
+
|
6
|
+
# Caches
|
7
|
+
require 'apollo_crawler/caches/factory'
|
8
|
+
require 'apollo_crawler/caches/memory_cache'
|
9
|
+
require 'apollo_crawler/caches/null_cache'
|
10
|
+
|
11
|
+
# Crawlers
|
12
|
+
require 'apollo_crawler/crawlers/alexa_com/alexa'
|
13
|
+
require 'apollo_crawler/crawlers/firmy_cz/firmy'
|
14
|
+
require 'apollo_crawler/crawlers/slashdot_org/slashdot'
|
15
|
+
require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
|
16
|
+
|
17
|
+
# Formatters
|
18
|
+
require 'apollo_crawler/formatters/formatter_json'
|
19
|
+
require 'apollo_crawler/formatters/formatter_plain'
|
20
|
+
require 'apollo_crawler/formatters/formatter_table'
|
data/lib/apollo_crawler/cache.rb
CHANGED
@@ -1,34 +1,34 @@
|
|
1
|
-
module Apollo
|
2
|
-
module Caches
|
3
|
-
class Cache
|
4
|
-
# Get value associated with key from cache
|
5
|
-
def get(key, *args)
|
6
|
-
|
7
|
-
# Not found, Create, cache and return
|
8
|
-
res = yield args
|
9
|
-
return res
|
10
|
-
end
|
11
|
-
|
12
|
-
# Set value associated with key
|
13
|
-
# Return cached value
|
14
|
-
def set(key, value)
|
15
|
-
return value
|
16
|
-
end
|
17
|
-
|
18
|
-
# Check if cache contains specified key
|
19
|
-
def contains(key)
|
20
|
-
return false
|
21
|
-
end
|
22
|
-
|
23
|
-
# Invalidate key/value pair
|
24
|
-
def invalidate(key)
|
25
|
-
return true
|
26
|
-
end
|
27
|
-
|
28
|
-
# Clear cache
|
29
|
-
def clear
|
30
|
-
return
|
31
|
-
end
|
32
|
-
end # Cache
|
33
|
-
end # Caches
|
34
|
-
end # Apollo
|
1
|
+
module Apollo
  module Caches
    # Base cache. Defines the interface shared by the cache implementations
    # and itself stores nothing: every lookup is treated as a miss.
    class Cache
      # Get value associated with key from cache.
      #
      # This base implementation never remembers anything, so the block is
      # always evaluated and its result returned.
      #
      # @param key  lookup key (unused here)
      # @param args extra arguments, handed to the block as a single Array
      # @return the block's result, or nil when no block is given
      def get(key, *args)
        # A bare `yield` without a block raises LocalJumpError.
        return nil unless block_given?

        yield args
      end

      # Set value associated with key.
      #
      # @return the value passed in (nothing is stored)
      def set(key, value)
        value
      end

      # Check if cache contains specified key.
      #
      # @return [Boolean] always false, nothing is ever stored
      def contains(key)
        false
      end

      # Invalidate key/value pair.
      #
      # @return [Boolean] always true
      def invalidate(key)
        true
      end

      # Clear cache.
      #
      # @return [nil]
      def clear
        nil
      end
    end # Cache
  end # Caches
end # Apollo
|
@@ -1,30 +1,30 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
-
require 'singleton'
|
3
|
-
|
4
|
-
module Apollo
|
5
|
-
module Caches
|
6
|
-
class Factory
|
7
|
-
include Singleton
|
8
|
-
|
9
|
-
def self.construct()
|
10
|
-
self.singleton.construct()
|
11
|
-
end
|
12
|
-
|
13
|
-
def construct()
|
14
|
-
# Basic implementation
|
15
|
-
# res = Cache.new()
|
16
|
-
|
17
|
-
# Filesystem implementation
|
18
|
-
res = Filesystem.new()
|
19
|
-
|
20
|
-
# In-memory implementation
|
21
|
-
# res = Memory.new()
|
22
|
-
|
23
|
-
# Null (Dummy) implementation
|
24
|
-
# res = Null.new()
|
25
|
-
|
26
|
-
return res
|
27
|
-
end
|
28
|
-
end # Factory
|
29
|
-
end # Caches
|
30
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
require 'singleton'
|
3
|
+
|
4
|
+
module Apollo
  module Caches
    # Singleton factory that decides which cache implementation is used.
    class Factory
      include Singleton

      # Class-level shortcut for Factory.instance.construct.
      #
      # The Singleton mixin exposes the shared object through `instance`;
      # there is no `singleton` accessor, so calling it raised NoMethodError.
      #
      # @return a newly constructed cache object
      def self.construct
        instance.construct
      end

      # Build the cache implementation currently selected.
      #
      # A new object is created on every call. Alternatives that can be
      # swapped in here:
      #   Cache.new   - basic implementation
      #   Memory.new  - in-memory implementation
      #   Null.new    - null (dummy) implementation
      #
      # @return [Filesystem] a new filesystem cache
      def construct
        Filesystem.new
      end
    end # Factory
  end # Caches
end # Apollo
|
@@ -1,34 +1,34 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Caches
|
5
|
-
class Filesystem < Cache
|
6
|
-
def initialize
|
7
|
-
# puts "This if Filesystem cache"
|
8
|
-
end
|
9
|
-
|
10
|
-
# Get value associated with key from cache
|
11
|
-
def get(key, *args)
|
12
|
-
# Not found, Create, cache and return
|
13
|
-
res = yield args
|
14
|
-
return res
|
15
|
-
end
|
16
|
-
|
17
|
-
# Set value associated with key
|
18
|
-
# Return cached value
|
19
|
-
def set(key, value)
|
20
|
-
return value
|
21
|
-
end
|
22
|
-
|
23
|
-
# Check if cache contains specified key
|
24
|
-
def contains(key)
|
25
|
-
return false
|
26
|
-
end
|
27
|
-
|
28
|
-
# Invalidate key/value pair
|
29
|
-
def invalidate(key)
|
30
|
-
return true
|
31
|
-
end
|
32
|
-
end # Filesystem
|
33
|
-
end # Caches
|
34
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
|
3
|
+
module Apollo
  module Caches
    # Filesystem cache. Nothing is written to disk by this class as it
    # stands: every lookup is a miss and every write is discarded.
    class Filesystem < Cache
      def initialize
        # puts "This is Filesystem cache"
      end

      # Get value associated with key from cache.
      #
      # Nothing is stored, so the block is always evaluated.
      #
      # @param key  lookup key (unused here)
      # @param args extra arguments, handed to the block as a single Array
      # @return the block's result, or nil when no block is given
      def get(key, *args)
        # A bare `yield` without a block raises LocalJumpError.
        return nil unless block_given?

        yield args
      end

      # Set value associated with key.
      #
      # @return the value passed in (nothing is stored)
      def set(key, value)
        value
      end

      # Check if cache contains specified key.
      #
      # @return [Boolean] always false
      def contains(key)
        false
      end

      # Invalidate key/value pair.
      #
      # @return [Boolean] always true
      def invalidate(key)
        true
      end
    end # Filesystem
  end # Caches
end # Apollo
|
@@ -1,43 +1,43 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Caches
|
5
|
-
class Memory < Cache
|
6
|
-
@storage = nil
|
7
|
-
|
8
|
-
def initialize
|
9
|
-
@storage = {}
|
10
|
-
end
|
11
|
-
|
12
|
-
# Get value associated with key from cache
|
13
|
-
def get(key, *args)
|
14
|
-
@storage[key]
|
15
|
-
|
16
|
-
# Not found, Create, cache and return
|
17
|
-
res = yield args
|
18
|
-
return res
|
19
|
-
end
|
20
|
-
|
21
|
-
# Set value associated with key
|
22
|
-
# Return cached value
|
23
|
-
def set(key, value)
|
24
|
-
@storage[key] = value
|
25
|
-
end
|
26
|
-
|
27
|
-
# Check if cache contains specified key
|
28
|
-
def contains(key)
|
29
|
-
@storage.has_key?(key)
|
30
|
-
end
|
31
|
-
|
32
|
-
# Invalidate key/value pair
|
33
|
-
def invalidate(key)
|
34
|
-
@storage.delete(key)
|
35
|
-
end
|
36
|
-
|
37
|
-
# Clear cache
|
38
|
-
def clear
|
39
|
-
@storage.clear
|
40
|
-
end
|
41
|
-
end # Null
|
42
|
-
end # Caches
|
43
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
|
3
|
+
module Apollo
  module Caches
    # In-memory cache backed by a Hash held in the instance.
    class Memory < Cache
      def initialize
        @storage = {}
      end

      # Get value associated with key from cache.
      #
      # On a hit the stored value is returned and the block is not run.
      # On a miss the block is evaluated, its result is stored under key
      # and returned. Previously the stored value was looked up and then
      # thrown away, so the block ran on every call and nothing was kept.
      #
      # @param key  lookup key
      # @param args extra arguments, handed to the block as a single Array
      # @return the cached or freshly computed value; nil on a miss when no
      #   block is given
      def get(key, *args)
        # key? rather than a truthiness test, so stored nil/false still hit.
        return @storage[key] if @storage.key?(key)
        return nil unless block_given?

        @storage[key] = yield args
      end

      # Set value associated with key.
      #
      # @return the stored value
      def set(key, value)
        @storage[key] = value
      end

      # Check if cache contains specified key.
      #
      # @return [Boolean]
      def contains(key)
        @storage.key?(key)
      end

      # Invalidate key/value pair.
      #
      # @return the removed value, or nil when the key was not present
      def invalidate(key)
        @storage.delete(key)
      end

      # Clear cache.
      #
      # @return [Hash] the emptied storage hash
      def clear
        @storage.clear
      end
    end # Memory
  end # Caches
end # Apollo
|
@@ -1,30 +1,30 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Caches
|
5
|
-
class Null < Cache
|
6
|
-
# Get value associated with key from cache
|
7
|
-
def get(key, *args)
|
8
|
-
# Not found, Create, cache and return
|
9
|
-
res = yield args
|
10
|
-
return res
|
11
|
-
end
|
12
|
-
|
13
|
-
# Set value associated with key
|
14
|
-
# Return cached value
|
15
|
-
def set(key, value)
|
16
|
-
return value
|
17
|
-
end
|
18
|
-
|
19
|
-
# Check if cache contains specified key
|
20
|
-
def contains(key)
|
21
|
-
return false
|
22
|
-
end
|
23
|
-
|
24
|
-
# Invalidate key/value pair
|
25
|
-
def invalidate(key)
|
26
|
-
return true
|
27
|
-
end
|
28
|
-
end # Null
|
29
|
-
end # Caches
|
30
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
|
3
|
+
module Apollo
  module Caches
    # Null (dummy) cache: never stores anything, every lookup is a miss.
    class Null < Cache
      # Get value associated with key from cache.
      #
      # Nothing is stored, so the block is always evaluated.
      #
      # @param key  lookup key (unused here)
      # @param args extra arguments, handed to the block as a single Array
      # @return the block's result, or nil when no block is given
      def get(key, *args)
        # A bare `yield` without a block raises LocalJumpError.
        return nil unless block_given?

        yield args
      end

      # Set value associated with key.
      #
      # @return the value passed in (nothing is stored)
      def set(key, value)
        value
      end

      # Check if cache contains specified key.
      #
      # @return [Boolean] always false
      def contains(key)
        false
      end

      # Invalidate key/value pair.
      #
      # @return [Boolean] always true
      def invalidate(key)
        true
      end
    end # Null
  end # Caches
end # Apollo
|
@@ -1,128 +1,155 @@
|
|
1
|
-
require "open-uri"
|
2
|
-
require "nokogiri"
|
3
|
-
|
4
|
-
module Apollo
|
5
|
-
module Crawlers
|
6
|
-
class Crawler
|
7
|
-
@backlog = nil
|
8
|
-
|
9
|
-
def initialize
|
10
|
-
@backlog = []
|
11
|
-
end
|
12
|
-
|
13
|
-
# Name of the crawler
|
14
|
-
def name
|
15
|
-
return "Crawler Base"
|
16
|
-
end
|
17
|
-
|
18
|
-
def url
|
19
|
-
return nil
|
20
|
-
end
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
if(url.
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
# TODO:
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
1
|
+
require "open-uri"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
module Apollo
|
5
|
+
module Crawlers
|
6
|
+
# Base crawler. Subclasses override #name, #url, #extract_data and
# #extract_links; this class drives the fetch/extract loop.
class Crawler
  def initialize
    # Queue of URLs still waiting to be processed by #etl.
    @backlog = []
  end

  # Name of the crawler
  def name
    "Crawler Base"
  end

  # Default start URL; nil in the base class.
  def url
    nil
  end

  # Resolve url against root.
  #
  # @return [URI, nil] the joined URI, or nil when URI.join raises
  def self.try_get_url(root, url)
    URI.join(root, url)
  rescue
    # Best effort: an unparsable link is reported as "no URL".
    nil
  end

  # - (0) Figure out URL
  # - (1) Extract Data
  # - (2) Extract Links
  # - (3) Go to (0) eventually
  #
  # @param url [String, Array, nil] start URL(s); falls back to #url when
  #   nil or empty
  # @yield [Array] the results accumulated so far, after each document
  #   (the block is optional)
  # @return [Array, nil] one #process_url result per processed URL, or nil
  #   when there is no URL to start from
  def etl(url=nil, &block)
    # Look for passed URL use default instead and fail if it is not valid
    url = self.url if url.nil? || url.empty?
    return nil if url.nil?

    if url.kind_of?(Array)
      @backlog.concat(url)
    else
      @backlog << url
    end

    res = []
    # TODO: Respect limit of documents/urls processed
    until @backlog.empty?
      current = @backlog.shift

      doc = process_url(current)
      res << doc

      # TODO: Use log4r and log it only on info level
      # TODO: Add some async/callback signal for document processed
      # Yielding without a block would raise LocalJumpError.
      yield res if block_given?

      next if doc.nil? || doc.empty?

      # TODO: Check if it is unique
      doc[:links].each { |link| @backlog << link[:link].to_s }
    end
    res
  end

  # Fetch one document and run both extraction steps on it.
  #
  # @return [Hash, nil] hash with :crawler, :title, :data and :links keys,
  #   or nil when the document could not be fetched
  def process_url(url)
    # Try fetch document
    doc = fetch_document(url)
    return nil if doc.nil?

    # Try extract data from document
    data = extract_data(doc)

    # Try extract links for another documents
    links = extract_links(doc)
    # NOTE(review): looks like leftover debug output; kept because it is
    # observable on stdout - confirm before removing.
    puts links.inspect

    # Format ETL result
    {
      :crawler => self.class.name,
      :title => doc.title,
      :data => data,
      :links => links
    }
  end

  # Fetch document
  #
  # @param url [String, nil] address to fetch; falls back to #url when nil
  # @return the parsed Nokogiri::HTML document, or nil when there is no URL
  #   or every download attempt failed
  def fetch_document(url)
    url = self.url if url.nil?
    return nil if url.nil?

    # TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
    cache = Apollo::Caches::Factory.instance.construct
    raw = cache.get(url) { download(url) }

    # Parsing nil would hand back an empty document instead of nil, which
    # defeats the nil check in #process_url.
    return nil if raw.nil?

    # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
    Nokogiri::HTML(raw)
  end

  # Extracts data from document
  def extract_data(doc)
    []
  end

  # Extract links to another documents from this document
  def extract_links(doc)
    []
  end

  private

  # Download url, retrying up to max_attempts times with a one second
  # pause after each failure.
  #
  # @return [String, nil] response body, or nil when all attempts failed
  def download(url, max_attempts = 3)
    max_attempts.times do
      begin
        return read_body(url)
      rescue StandardError => e
        # StandardError rather than Exception, so Interrupt and SystemExit
        # still stop the crawl instead of being retried.
        puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
        sleep 1
      end
    end
    nil
  end

  # Read the body behind url, closing the handle afterwards.
  def read_body(url)
    # open-uri stopped patching Kernel#open in Ruby 3.0; URI.open exists
    # from Ruby 2.5 on. Use whichever this interpreter provides.
    if URI.respond_to?(:open)
      URI.open(url) { |io| io.read }
    else
      open(url) { |io| io.read }
    end
  end
end
|
154
|
+
end
|
128
155
|
end
|