apollo-crawler 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. checksums.yaml +8 -8
  2. data/bin/apollo-crawler +12 -410
  3. data/lib/apollo_crawler.rb +31 -20
  4. data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} +37 -34
  5. data/lib/apollo_crawler/cache/factory.rb +35 -0
  6. data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb +37 -34
  7. data/lib/apollo_crawler/cache/memcached_cache.rb +51 -0
  8. data/lib/apollo_crawler/{caches → cache}/memory_cache.rb +46 -43
  9. data/lib/apollo_crawler/{caches → cache}/null_cache.rb +33 -30
  10. data/lib/apollo_crawler/config.rb +53 -0
  11. data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} +157 -155
  12. data/lib/apollo_crawler/{crawler_template.rb → crawler/crawler_template.rb} +24 -24
  13. data/lib/apollo_crawler/{crawlers → crawler}/google_com/google.rb +40 -40
  14. data/lib/apollo_crawler/{crawlers → crawler}/slashdot_org/slashdot.rb +40 -40
  15. data/lib/apollo_crawler/{crawlers → crawler}/stackoverflow_com/stackoverflow.rb +44 -44
  16. data/lib/apollo_crawler/{crawlers → crawler}/xkcd_com/xkcd.rb +35 -35
  17. data/lib/apollo_crawler/{crawlers → crawler}/ycombinator_com/hacker_news.rb +44 -44
  18. data/lib/apollo_crawler/fetcher/fetcher_base.rb +6 -0
  19. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +8 -0
  20. data/lib/apollo_crawler/formatter/formatter_base.rb +6 -0
  21. data/lib/apollo_crawler/{formatters → formatter}/formatter_json.rb +17 -17
  22. data/lib/apollo_crawler/{formatters → formatter}/formatter_plain.rb +17 -17
  23. data/lib/apollo_crawler/{formatters → formatter}/formatter_table.rb +35 -35
  24. data/lib/apollo_crawler/lib.rb +28 -0
  25. data/lib/apollo_crawler/program.rb +406 -0
  26. data/lib/apollo_crawler/store/store_base.rb +6 -0
  27. data/lib/apollo_crawler/version.rb +2 -2
  28. metadata +52 -17
  29. data/lib/apollo_crawler/caches/factory.rb +0 -30
  30. data/lib/apollo_crawler/formatter.rb +0 -6
@@ -1,34 +1,37 @@
1
- module Apollo
2
- module Caches
3
- class Cache
4
- # Get value associated with key from cache
5
- def get(key, *args)
6
-
7
- # Not found, Create, cache and return
8
- res = yield args
9
- return res
10
- end
11
-
12
- # Set value associated with key
13
- # Return cached value
14
- def set(key, value)
15
- return value
16
- end
17
-
18
- # Check if cache contains specified key
19
- def contains(key)
20
- return false
21
- end
22
-
23
- # Invalidate key/value pair
24
- def invalidate(key)
25
- return true
26
- end
27
-
28
- # Clear cache
29
- def clear
30
- return
31
- end
32
- end # Cache
33
- end # Caches
34
- end # Apollo
1
+ module Apollo
2
+ module Cache
3
+ class CacheBase
4
+ # Get value associated with key from cache
5
+ def get(key, *args)
6
+
7
+ # Base class stores nothing: compute the value via the block (if given) and return it
8
+ if block_given?
9
+ res = yield args
10
+ end
11
+
12
+ return res
13
+ end
14
+
15
+ # Set value associated with key
16
+ # Return cached value
17
+ def set(key, value)
18
+ return value
19
+ end
20
+
21
+ # Check if cache contains specified key
22
+ def contains(key)
23
+ return false
24
+ end
25
+
26
+ # Invalidate key/value pair
27
+ def invalidate(key)
28
+ return true
29
+ end
30
+
31
+ # Clear cache
32
+ def clear
33
+ return
34
+ end
35
+ end # CacheBase
36
+ end # Cache
37
+ end # Apollo
@@ -0,0 +1,35 @@
1
+ # Global config file
2
+ require File.join(File.dirname(__FILE__), '..', 'config')
3
+
4
+ # Cache instance base class
5
+ require File.join(File.dirname(__FILE__), 'cache_base')
6
+
7
+ # Factory uses singleton pattern
8
+ require 'singleton'
9
+
10
+ module Apollo
11
+ module Cache
12
+ class Factory
13
+ include Singleton
14
+
15
+ def initialize
16
+ @cache = nil
17
+ end
18
+
19
+ def self.construct()
20
+ self.singleton.construct()
21
+ end
22
+
23
+ def construct()
24
+ if(@cache.nil? == false)
25
+ return @cache
26
+ end
27
+
28
+ res = RbConfig::CACHE_CLASS.new
29
+
30
+ @cache = res
31
+ return res
32
+ end
33
+ end # Factory
34
+ end # Cache
35
+ end # Apollo
@@ -1,34 +1,37 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Filesystem < Cache
6
- def initialize
7
- # puts "This if Filesystem cache"
8
- end
9
-
10
- # Get value associated with key from cache
11
- def get(key, *args)
12
- # Not found, Create, cache and return
13
- res = yield args
14
- return res
15
- end
16
-
17
- # Set value associated with key
18
- # Return cached value
19
- def set(key, value)
20
- return value
21
- end
22
-
23
- # Check if cache contains specified key
24
- def contains(key)
25
- return false
26
- end
27
-
28
- # Invalidate key/value pair
29
- def invalidate(key)
30
- return true
31
- end
32
- end # Filesystem
33
- end # Caches
34
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), 'cache_base')
2
+
3
+ module Apollo
4
+ module Cache
5
+ class Filesystem < CacheBase
6
+ def initialize
7
+ # puts "This if Filesystem cache"
8
+ end
9
+
10
+ # Get value associated with key from cache
11
+ def get(key, *args)
12
+ # Not found, Create, cache and return
13
+ if block_given?
14
+ res = yield args
15
+ end
16
+
17
+ return res
18
+ end
19
+
20
+ # Set value associated with key
21
+ # Return cached value
22
+ def set(key, value)
23
+ return value
24
+ end
25
+
26
+ # Check if cache contains specified key
27
+ def contains(key)
28
+ return false
29
+ end
30
+
31
+ # Invalidate key/value pair
32
+ def invalidate(key)
33
+ return true
34
+ end
35
+ end # Filesystem
36
+ end # Cache
37
+ end # Apollo
@@ -0,0 +1,51 @@
1
+ require File.join(File.dirname(__FILE__), 'cache_base')
2
+
3
+ require 'dalli'
4
+
5
+ module Apollo
6
+ module Cache
7
+ class Memcached < CacheBase
8
+ @cache = nil
9
+
10
+ def initialize
11
+ @cache = Dalli::Client.new()
12
+ end
13
+
14
+ # Get value associated with key from cache
15
+ def get(key, *args)
16
+ res = @cache.get(key)
17
+
18
+ # Not found, Create, cache and return
19
+ if res.nil? && block_given?
20
+ res = yield args
21
+
22
+ self.set(key, res)
23
+ end
24
+
25
+ return res
26
+ end
27
+
28
+ # Set value associated with key
29
+ # NOTE(review): returns the key, whereas the sibling cache classes return the value
30
+ def set(key, value)
31
+ @cache.set(key, value)
32
+ return key
33
+ end
34
+
35
+ # Check if cache contains specified key
36
+ def contains(key)
37
+ # TODO: Implement
38
+ end
39
+
40
+ # Invalidate key/value pair
41
+ def invalidate(key)
42
+ # TODO: Implement
43
+ end
44
+
45
+ # Clear cache
46
+ def clear
47
+ # TODO: Implement
48
+ end
49
+ end # Memcached
50
+ end # Cache
51
+ end # Apollo
@@ -1,43 +1,46 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Memory < Cache
6
- @storage = nil
7
-
8
- def initialize
9
- @storage = {}
10
- end
11
-
12
- # Get value associated with key from cache
13
- def get(key, *args)
14
- @storage[key]
15
-
16
- # Not found, Create, cache and return
17
- res = yield args
18
- return res
19
- end
20
-
21
- # Set value associated with key
22
- # Return cached value
23
- def set(key, value)
24
- @storage[key] = value
25
- end
26
-
27
- # Check if cache contains specified key
28
- def contains(key)
29
- @storage.has_key?(key)
30
- end
31
-
32
- # Invalidate key/value pair
33
- def invalidate(key)
34
- @storage.delete(key)
35
- end
36
-
37
- # Clear cache
38
- def clear
39
- @storage.clear
40
- end
41
- end # Null
42
- end # Caches
43
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), 'cache_base')
2
+
3
+ module Apollo
4
+ module Cache
5
+ class Memory < CacheBase
6
+ @cache = nil
7
+
8
+ def initialize
9
+ @cache = {}
10
+ end
11
+
12
+ # Get value associated with key from cache
13
+ def get(key, *args)
14
+ res = @cache[key]
15
+
16
+ # Not found: compute via the block and return it (NOTE: the result is not stored in @cache)
17
+ if res.nil? && block_given?
18
+ res = yield args
19
+ end
20
+
21
+ return res
22
+ end
23
+
24
+ # Set value associated with key
25
+ # Return cached value
26
+ def set(key, value)
27
+ @cache[key] = value
28
+ end
29
+
30
+ # Check if cache contains specified key
31
+ def contains(key)
32
+ @cache.has_key?(key)
33
+ end
34
+
35
+ # Invalidate key/value pair
36
+ def invalidate(key)
37
+ @cache.delete(key)
38
+ end
39
+
40
+ # Clear cache
41
+ def clear
42
+ @cache.clear
43
+ end
44
+ end # Memory
45
+ end # Cache
46
+ end # Apollo
@@ -1,30 +1,33 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Null < Cache
6
- # Get value associated with key from cache
7
- def get(key, *args)
8
- # Not found, Create, cache and return
9
- res = yield args
10
- return res
11
- end
12
-
13
- # Set value associated with key
14
- # Return cached value
15
- def set(key, value)
16
- return value
17
- end
18
-
19
- # Check if cache contains specified key
20
- def contains(key)
21
- return false
22
- end
23
-
24
- # Invalidate key/value pair
25
- def invalidate(key)
26
- return true
27
- end
28
- end # Null
29
- end # Caches
30
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), 'cache_base')
2
+
3
+ module Apollo
4
+ module Cache
5
+ class Null < CacheBase
6
+ # Get value associated with key from cache
7
+ def get(key, *args)
8
+ # Not found, Create, cache and return
9
+ if block_given?
10
+ res = yield args
11
+ end
12
+
13
+ return res
14
+ end
15
+
16
+ # Set value associated with key
17
+ # Return cached value
18
+ def set(key, value)
19
+ return value
20
+ end
21
+
22
+ # Check if cache contains specified key
23
+ def contains(key)
24
+ return false
25
+ end
26
+
27
+ # Invalidate key/value pair
28
+ def invalidate(key)
29
+ return true
30
+ end
31
+ end # Null
32
+ end # Cache
33
+ end # Apollo
@@ -0,0 +1,53 @@
1
+ # Caches
2
+ require File.join(File.dirname(__FILE__), 'lib')
3
+
4
+ module RbConfig
5
+ ############################################################
6
+ # Caches - cache implementations
7
+ ############################################################
8
+ CACHES_DIR = File.join(File.dirname(__FILE__), "caches")
9
+
10
+
11
+
12
+ ############################################################
13
+ # Cache implementation used for caching retrieved pages
14
+ ############################################################
15
+ #
16
+ # Filesystem backend
17
+ # CACHE_CLASS = Apollo::Cache::Filesystem
18
+ #
19
+ # Memcached - expects localhost:11211
20
+ # CACHE_CLASS = Apollo::Cache::Memcached
21
+ #
22
+ # Pure naive ruby in-memory implementation
23
+ # CACHE_CLASS = Apollo::Cache::Memory
24
+ #
25
+ # Null caching - no caching at all
26
+ # CACHE_CLASS = Apollo::Cache::Null
27
+
28
+ # Caching mechanism used by default
29
+ CACHE_CLASS = Apollo::Cache::Memcached
30
+
31
+
32
+
33
+ ############################################################
34
+ # Crawlers - Built-in crawlers that work out of the box
35
+ ############################################################
36
+ CRAWLERS_DIR = File.join(File.dirname(__FILE__), "crawlers")
37
+
38
+ # Template used for generated crawlers
39
+ CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
40
+
41
+ # Path of template
42
+ CRAWLER_TEMPLATE_PATH = File.join(File.dirname(__FILE__), "crawler_template.rb")
43
+
44
+
45
+
46
+ ############################################################
47
+ # Formatters - used for formatting crawled documents results
48
+ ############################################################
49
+ FORMATTERS_DIR = File.join(File.dirname(__FILE__), "formatters")
50
+
51
+ # Default formatter if no other specified
52
+ DEFAULT_FORMATTER = Apollo::Formatter::Json
53
+ end # RbConfig
@@ -1,155 +1,157 @@
1
- require "open-uri"
2
- require "nokogiri"
3
-
4
- module Apollo
5
- module Crawlers
6
- class Crawler
7
- @backlog = nil
8
-
9
- def initialize
10
- @backlog = []
11
- end
12
-
13
- # Name of the crawler
14
- def name
15
- return "Crawler Base"
16
- end
17
-
18
- def url
19
- return nil
20
- end
21
-
22
- def self.try_get_url(root, url)
23
- begin
24
- return URI.join(root, url)
25
- rescue
26
- return nil
27
- end
28
- end
29
-
30
- # - (0) Figure out URL
31
- # - (1) Extract Data
32
- # - (2) Extract Links
33
- # - (3) Go to (0) eventually
34
- def etl(url=nil, &block)
35
- # Look for passed URL use default instead and fail if it is not valid
36
- if(url.nil? || url.empty?)
37
- url = self.url
38
- end
39
-
40
- if(url.nil?)
41
- return nil
42
- end
43
-
44
- if(url.kind_of?(Array))
45
- @backlog.concat(url)
46
- else
47
- @backlog << url
48
- end
49
-
50
- res = []
51
- # TODO: Respect limit of documents/urls processed
52
- while(@backlog.empty? == false)
53
- url = @backlog.shift
54
-
55
- # puts "Processing '#{url}'"
56
- doc = self.process_url(url)
57
- res << doc
58
-
59
- # TODO: Use log4r and log it only on info level
60
- # TODO: Add some async/callback signal for document processed
61
- yield res
62
-
63
- if(!doc.nil? && !doc.empty?)
64
- doc[:links].each do |link|
65
- url = link[:link].to_s
66
- # TODO: Use log4r and log it only on info level
67
- #puts url
68
-
69
- # TODO: Check if it is unique
70
- @backlog << url
71
- end
72
- end
73
- end
74
- return res
75
- end
76
-
77
- def process_url(url)
78
- # Try fetch document
79
- doc = self.fetch_document(url)
80
- if(doc.nil?)
81
- return nil
82
- end
83
-
84
- # Try extract data from document
85
- data = self.extract_data(doc)
86
-
87
- # Try extract links for another documents
88
- links = self.extract_links(doc)
89
- puts links.inspect
90
-
91
- # Format ETL result
92
- res = {
93
- :crawler => self.class.name,
94
- :title => doc.title,
95
- :data => data,
96
- :links => links
97
- }
98
-
99
- return res
100
- end
101
-
102
- # Fetch document
103
- def fetch_document(url)
104
- # TODO: Refactor following idiom
105
- if(url == nil)
106
- url = self.url
107
- end
108
-
109
- if(url.nil?)
110
- return nil
111
- end
112
-
113
- # TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
114
- cache = Apollo::Caches::Factory.instance.construct
115
- raw = cache.get(url) do
116
- max_attempts = 3
117
- attempt_no = 0
118
- success = false
119
-
120
- res = nil
121
- while(attempt_no < max_attempts && success == false) do
122
- begin
123
- res = open(url).read
124
- success = true
125
- rescue Exception => e
126
- puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
127
- sleep 1
128
-
129
- attempt_no = attempt_no + 1
130
- success = false
131
- end
132
- end
133
-
134
- res
135
- end
136
-
137
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
138
- doc = Nokogiri::HTML(raw)
139
- return doc
140
- end
141
-
142
- # Extracts data from document
143
- def extract_data(doc)
144
- res = []
145
- return res
146
- end
147
-
148
- # Extract links to another documents from this document
149
- def extract_links(doc)
150
- res = []
151
- return res
152
- end
153
- end
154
- end
155
- end
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
+ module Apollo
5
+ module Crawler
6
+ class CrawlerBase
7
+ @backlog = nil
8
+
9
+ def initialize
10
+ @backlog = []
11
+ end
12
+
13
+ # Name of the crawler
14
+ def name
15
+ return "Crawler Base"
16
+ end
17
+
18
+ def url
19
+ return nil
20
+ end
21
+
22
+ def self.try_get_url(root, url)
23
+ begin
24
+ return URI.join(root, url)
25
+ rescue
26
+ return nil
27
+ end
28
+ end
29
+
30
+ # - (0) Figure out URL
31
+ # - (1) Extract Data
32
+ # - (2) Extract Links
33
+ # - (3) Go to (0) eventually
34
+ def etl(url=nil, &block)
35
+ # Use the passed URL; fall back to the default URL if it is nil or empty, and return nil if none is available
36
+ if(url.nil? || url.empty?)
37
+ url = self.url
38
+ end
39
+
40
+ if(url.nil?)
41
+ return nil
42
+ end
43
+
44
+ if(url.kind_of?(Array))
45
+ @backlog.concat(url)
46
+ else
47
+ @backlog << url
48
+ end
49
+
50
+ res = []
51
+ # TODO: Respect limit of documents/urls processed
52
+ while(@backlog.empty? == false)
53
+ url = @backlog.shift
54
+
55
+ # puts "Processing '#{url}'"
56
+ doc = self.process_url(url)
57
+ res << doc
58
+
59
+ # TODO: Use log4r and log it only on info level
60
+ # TODO: Add some async/callback signal for document processed
61
+ if block_given?
62
+ yield res
63
+ end
64
+
65
+ if(!doc.nil? && !doc.empty?)
66
+ doc[:links].each do |link|
67
+ url = link[:link].to_s
68
+ # TODO: Use log4r and log it only on info level
69
+ #puts url
70
+
71
+ # TODO: Check if it is unique
72
+ @backlog << url
73
+ end
74
+ end
75
+ end
76
+ return res
77
+ end
78
+
79
+ def process_url(url)
80
+ # Try to fetch the document
81
+ doc = self.fetch_document(url)
82
+ if(doc.nil?)
83
+ return nil
84
+ end
85
+
86
+ # Try to extract data from the document
87
+ data = self.extract_data(doc)
88
+
89
+ # Try to extract links to other documents
90
+ links = self.extract_links(doc)
91
+ puts links.inspect
92
+
93
+ # Format ETL result
94
+ res = {
95
+ :crawler => self.class.name,
96
+ :title => doc.title,
97
+ :data => data,
98
+ :links => links
99
+ }
100
+
101
+ return res
102
+ end
103
+
104
+ # Fetch document
105
+ def fetch_document(url)
106
+ # TODO: Refactor following idiom
107
+ if(url == nil)
108
+ url = self.url
109
+ end
110
+
111
+ if(url.nil?)
112
+ return nil
113
+ end
114
+
115
+ # TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
116
+ cache = Apollo::Cache::Factory.instance.construct
117
+ raw = cache.get(url) do
118
+ max_attempts = 3
119
+ attempt_no = 0
120
+ success = false
121
+
122
+ res = nil
123
+ while(attempt_no < max_attempts && success == false) do
124
+ begin
125
+ res = open(url).read
126
+ success = true
127
+ rescue Exception => e
128
+ puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
129
+ sleep 1
130
+
131
+ attempt_no = attempt_no + 1
132
+ success = false
133
+ end
134
+ end
135
+
136
+ res
137
+ end
138
+
139
+ # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
140
+ doc = Nokogiri::HTML(raw)
141
+ return doc
142
+ end
143
+
144
+ # Extracts data from document
145
+ def extract_data(doc)
146
+ res = []
147
+ return res
148
+ end
149
+
150
+ # Extract links to other documents from this document
151
+ def extract_links(doc)
152
+ res = []
153
+ return res
154
+ end
155
+ end # CrawlerBase
156
+ end # Crawler
157
+ end # Apollo