apollo-crawler 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (30) hide show
  1. checksums.yaml +8 -8
  2. data/bin/apollo-crawler +12 -410
  3. data/lib/apollo_crawler.rb +31 -20
  4. data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} +37 -34
  5. data/lib/apollo_crawler/cache/factory.rb +35 -0
  6. data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb +37 -34
  7. data/lib/apollo_crawler/cache/memcached_cache.rb +51 -0
  8. data/lib/apollo_crawler/{caches → cache}/memory_cache.rb +46 -43
  9. data/lib/apollo_crawler/{caches → cache}/null_cache.rb +33 -30
  10. data/lib/apollo_crawler/config.rb +53 -0
  11. data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} +157 -155
  12. data/lib/apollo_crawler/{crawler_template.rb → crawler/crawler_template.rb} +24 -24
  13. data/lib/apollo_crawler/{crawlers → crawler}/google_com/google.rb +40 -40
  14. data/lib/apollo_crawler/{crawlers → crawler}/slashdot_org/slashdot.rb +40 -40
  15. data/lib/apollo_crawler/{crawlers → crawler}/stackoverflow_com/stackoverflow.rb +44 -44
  16. data/lib/apollo_crawler/{crawlers → crawler}/xkcd_com/xkcd.rb +35 -35
  17. data/lib/apollo_crawler/{crawlers → crawler}/ycombinator_com/hacker_news.rb +44 -44
  18. data/lib/apollo_crawler/fetcher/fetcher_base.rb +6 -0
  19. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +8 -0
  20. data/lib/apollo_crawler/formatter/formatter_base.rb +6 -0
  21. data/lib/apollo_crawler/{formatters → formatter}/formatter_json.rb +17 -17
  22. data/lib/apollo_crawler/{formatters → formatter}/formatter_plain.rb +17 -17
  23. data/lib/apollo_crawler/{formatters → formatter}/formatter_table.rb +35 -35
  24. data/lib/apollo_crawler/lib.rb +28 -0
  25. data/lib/apollo_crawler/program.rb +406 -0
  26. data/lib/apollo_crawler/store/store_base.rb +6 -0
  27. data/lib/apollo_crawler/version.rb +2 -2
  28. metadata +52 -17
  29. data/lib/apollo_crawler/caches/factory.rb +0 -30
  30. data/lib/apollo_crawler/formatter.rb +0 -6
@@ -1,34 +1,37 @@
1
- module Apollo
2
- module Caches
3
- class Cache
4
- # Get value associated with key from cache
5
- def get(key, *args)
6
-
7
- # Not found, Create, cache and return
8
- res = yield args
9
- return res
10
- end
11
-
12
- # Set value associated with key
13
- # Return cached value
14
- def set(key, value)
15
- return value
16
- end
17
-
18
- # Check if cache contains specified key
19
- def contains(key)
20
- return false
21
- end
22
-
23
- # Invalidate key/value pair
24
- def invalidate(key)
25
- return true
26
- end
27
-
28
- # Clear cache
29
- def clear
30
- return
31
- end
32
- end # Cache
33
- end # Caches
34
- end # Apollo
1
+ module Apollo
2
+ module Cache
3
+ class CacheBase
4
+ # Get value associated with key from cache
5
+ def get(key, *args)
6
+
7
+ # Not found, Create, cache and return
8
+ if block_given?
9
+ res = yield args
10
+ end
11
+
12
+ return res
13
+ end
14
+
15
+ # Set value associated with key
16
+ # Return cached value
17
+ def set(key, value)
18
+ return value
19
+ end
20
+
21
+ # Check if cache contains specified key
22
+ def contains(key)
23
+ return false
24
+ end
25
+
26
+ # Invalidate key/value pair
27
+ def invalidate(key)
28
+ return true
29
+ end
30
+
31
+ # Clear cache
32
+ def clear
33
+ return
34
+ end
35
+ end # CacheBase
36
+ end # Cache
37
+ end # Apollo
@@ -0,0 +1,35 @@
1
+ # Global config file
2
+ require File.join(File.dirname(__FILE__), '..', 'config')
3
+
4
+ # Cache instance base class
5
+ require File.join(File.dirname(__FILE__), 'cache_base')
6
+
7
+ # Factory uses singleton pattern
8
+ require 'singleton'
9
+
10
+ module Apollo
11
+ module Cache
12
+ class Factory
13
+ include Singleton
14
+
15
+ def initialize
16
+ @cache = nil
17
+ end
18
+
19
+ def self.construct()
20
+ self.singleton.construct()
21
+ end
22
+
23
+ def construct()
24
+ if(@cache.nil? == false)
25
+ return @cache
26
+ end
27
+
28
+ res = RbConfig::CACHE_CLASS.new
29
+
30
+ @cache = res
31
+ return res
32
+ end
33
+ end # Factory
34
+ end # Cache
35
+ end # Apollo
@@ -1,34 +1,37 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Filesystem < Cache
6
- def initialize
7
- # puts "This if Filesystem cache"
8
- end
9
-
10
- # Get value associated with key from cache
11
- def get(key, *args)
12
- # Not found, Create, cache and return
13
- res = yield args
14
- return res
15
- end
16
-
17
- # Set value associated with key
18
- # Return cached value
19
- def set(key, value)
20
- return value
21
- end
22
-
23
- # Check if cache contains specified key
24
- def contains(key)
25
- return false
26
- end
27
-
28
- # Invalidate key/value pair
29
- def invalidate(key)
30
- return true
31
- end
32
- end # Filesystem
33
- end # Caches
34
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), 'cache_base')
2
+
3
+ module Apollo
4
+ module Cache
5
+ class Filesystem < CacheBase
6
+ def initialize
7
+ # puts "This if Filesystem cache"
8
+ end
9
+
10
+ # Get value associated with key from cache
11
+ def get(key, *args)
12
+ # Not found, Create, cache and return
13
+ if block_given?
14
+ res = yield args
15
+ end
16
+
17
+ return res
18
+ end
19
+
20
+ # Set value associated with key
21
+ # Return cached value
22
+ def set(key, value)
23
+ return value
24
+ end
25
+
26
+ # Check if cache contains specified key
27
+ def contains(key)
28
+ return false
29
+ end
30
+
31
+ # Invalidate key/value pair
32
+ def invalidate(key)
33
+ return true
34
+ end
35
+ end # Filesystem
36
+ end # Cache
37
+ end # Apollo
@@ -0,0 +1,51 @@
1
+ require File.join(File.dirname(__FILE__), 'cache_base')
2
+
3
+ require 'dalli'
4
+
5
+ module Apollo
6
+ module Cache
7
+ class Memcached < CacheBase
8
+ @cache = nil
9
+
10
+ def initialize
11
+ @cache = Dalli::Client.new()
12
+ end
13
+
14
+ # Get value associated with key from cache
15
+ def get(key, *args)
16
+ res = @cache.get(key)
17
+
18
+ # Not found, Create, cache and return
19
+ if res.nil? && block_given?
20
+ res = yield args
21
+
22
+ self.set(key, res)
23
+ end
24
+
25
+ return res
26
+ end
27
+
28
+ # Set value associated with key
29
+ # Return cached value
30
+ def set(key, value)
31
+ @cache.set(key, value)
32
+ return key
33
+ end
34
+
35
+ # Check if cache contains specified key
36
+ def contains(key)
37
+ # TODO: Implement
38
+ end
39
+
40
+ # Invalidate key/value pair
41
+ def invalidate(key)
42
+ # TODO: Implement
43
+ end
44
+
45
+ # Clear cache
46
+ def clear
47
+ # TODO: Implement
48
+ end
49
+ end # Memcached
50
+ end # Cache
51
+ end # Apollo
@@ -1,43 +1,46 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Memory < Cache
6
- @storage = nil
7
-
8
- def initialize
9
- @storage = {}
10
- end
11
-
12
- # Get value associated with key from cache
13
- def get(key, *args)
14
- @storage[key]
15
-
16
- # Not found, Create, cache and return
17
- res = yield args
18
- return res
19
- end
20
-
21
- # Set value associated with key
22
- # Return cached value
23
- def set(key, value)
24
- @storage[key] = value
25
- end
26
-
27
- # Check if cache contains specified key
28
- def contains(key)
29
- @storage.has_key?(key)
30
- end
31
-
32
- # Invalidate key/value pair
33
- def invalidate(key)
34
- @storage.delete(key)
35
- end
36
-
37
- # Clear cache
38
- def clear
39
- @storage.clear
40
- end
41
- end # Null
42
- end # Caches
43
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), 'cache_base')
2
+
3
+ module Apollo
4
+ module Cache
5
+ class Memory < CacheBase
6
+ @cache = nil
7
+
8
+ def initialize
9
+ @cache = {}
10
+ end
11
+
12
+ # Get value associated with key from cache
13
+ def get(key, *args)
14
+ res = @cache[key]
15
+
16
+ # Not found, Create, cache and return
17
+ if res.nil? && block_given?
18
+ res = yield args
19
+ end
20
+
21
+ return res
22
+ end
23
+
24
+ # Set value associated with key
25
+ # Return cached value
26
+ def set(key, value)
27
+ @cache[key] = value
28
+ end
29
+
30
+ # Check if cache contains specified key
31
+ def contains(key)
32
+ @cache.has_key?(key)
33
+ end
34
+
35
+ # Invalidate key/value pair
36
+ def invalidate(key)
37
+ @cache.delete(key)
38
+ end
39
+
40
+ # Clear cache
41
+ def clear
42
+ @cache.clear
43
+ end
44
+ end # Memory
45
+ end # Cache
46
+ end # Apollo
@@ -1,30 +1,33 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Null < Cache
6
- # Get value associated with key from cache
7
- def get(key, *args)
8
- # Not found, Create, cache and return
9
- res = yield args
10
- return res
11
- end
12
-
13
- # Set value associated with key
14
- # Return cached value
15
- def set(key, value)
16
- return value
17
- end
18
-
19
- # Check if cache contains specified key
20
- def contains(key)
21
- return false
22
- end
23
-
24
- # Invalidate key/value pair
25
- def invalidate(key)
26
- return true
27
- end
28
- end # Null
29
- end # Caches
30
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), 'cache_base')
2
+
3
+ module Apollo
4
+ module Cache
5
+ class Null < CacheBase
6
+ # Get value associated with key from cache
7
+ def get(key, *args)
8
+ # Not found, Create, cache and return
9
+ if block_given?
10
+ res = yield args
11
+ end
12
+
13
+ return res
14
+ end
15
+
16
+ # Set value associated with key
17
+ # Return cached value
18
+ def set(key, value)
19
+ return value
20
+ end
21
+
22
+ # Check if cache contains specified key
23
+ def contains(key)
24
+ return false
25
+ end
26
+
27
+ # Invalidate key/value pair
28
+ def invalidate(key)
29
+ return true
30
+ end
31
+ end # Null
32
+ end # Cache
33
+ end # Apollo
@@ -0,0 +1,53 @@
1
+ # Caches
2
+ require File.join(File.dirname(__FILE__), 'lib')
3
+
4
+ module RbConfig
5
+ ############################################################
6
+ # Caches - caches implementations
7
+ ############################################################
8
+ CACHES_DIR = File.join(File.dirname(__FILE__), "caches")
9
+
10
+
11
+
12
+ ############################################################
13
+ # Cache implementation used for caching retrieved pages
14
+ ############################################################
15
+ #
16
+ # Filesystem backend
17
+ # CACHE_CLASS = Apollo::Cache::Filesystem
18
+ #
19
+ # Memcached - expects localhost:11211
20
+ # CACHE_CLASS = Apollo::Cache::Memcached
21
+ #
22
+ # Pure naive ruby in-memory implementation
23
+ # CACHE_CLASS = Apollo::Cache::Memory
24
+ #
25
+ # Null caching - no caching at all
26
+ # CACHE_CLASS = Apollo::Cache::Null
27
+
28
+ # Used caching mechanism by default
29
+ CACHE_CLASS = Apollo::Cache::Memcached
30
+
31
+
32
+
33
+ ############################################################
34
+ # Crawlers - built-in crawlers that work out of the box
35
+ ############################################################
36
+ CRAWLERS_DIR = File.join(File.dirname(__FILE__), "crawlers")
37
+
38
+ # Template used for generated crawlers
39
+ CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
40
+
41
+ # Path of template
42
+ CRAWLER_TEMPLATE_PATH = File.join(File.dirname(__FILE__), "crawler_template.rb")
43
+
44
+
45
+
46
+ ############################################################
47
+ # Formatters - used for formatting crawled documents results
48
+ ############################################################
49
+ FORMATTERS_DIR = File.join(File.dirname(__FILE__), "formatters")
50
+
51
+ # Default formatter if no other specified
52
+ DEFAULT_FORMATTER = Apollo::Formatter::Json
53
+ end # RbConfig
@@ -1,155 +1,157 @@
1
- require "open-uri"
2
- require "nokogiri"
3
-
4
- module Apollo
5
- module Crawlers
6
- class Crawler
7
- @backlog = nil
8
-
9
- def initialize
10
- @backlog = []
11
- end
12
-
13
- # Name of the crawler
14
- def name
15
- return "Crawler Base"
16
- end
17
-
18
- def url
19
- return nil
20
- end
21
-
22
- def self.try_get_url(root, url)
23
- begin
24
- return URI.join(root, url)
25
- rescue
26
- return nil
27
- end
28
- end
29
-
30
- # - (0) Figure out URL
31
- # - (1) Extract Data
32
- # - (2) Extract Links
33
- # - (3) Go to (0) eventually
34
- def etl(url=nil, &block)
35
- # Look for passed URL use default instead and fail if it is not valid
36
- if(url.nil? || url.empty?)
37
- url = self.url
38
- end
39
-
40
- if(url.nil?)
41
- return nil
42
- end
43
-
44
- if(url.kind_of?(Array))
45
- @backlog.concat(url)
46
- else
47
- @backlog << url
48
- end
49
-
50
- res = []
51
- # TODO: Respect limit of documents/urls processed
52
- while(@backlog.empty? == false)
53
- url = @backlog.shift
54
-
55
- # puts "Processing '#{url}'"
56
- doc = self.process_url(url)
57
- res << doc
58
-
59
- # TODO: Use log4r and log it only on info level
60
- # TODO: Add some async/callback signal for document processed
61
- yield res
62
-
63
- if(!doc.nil? && !doc.empty?)
64
- doc[:links].each do |link|
65
- url = link[:link].to_s
66
- # TODO: Use log4r and log it only on info level
67
- #puts url
68
-
69
- # TODO: Check if it is unique
70
- @backlog << url
71
- end
72
- end
73
- end
74
- return res
75
- end
76
-
77
- def process_url(url)
78
- # Try fetch document
79
- doc = self.fetch_document(url)
80
- if(doc.nil?)
81
- return nil
82
- end
83
-
84
- # Try extract data from document
85
- data = self.extract_data(doc)
86
-
87
- # Try extract links for another documents
88
- links = self.extract_links(doc)
89
- puts links.inspect
90
-
91
- # Format ETL result
92
- res = {
93
- :crawler => self.class.name,
94
- :title => doc.title,
95
- :data => data,
96
- :links => links
97
- }
98
-
99
- return res
100
- end
101
-
102
- # Fetch document
103
- def fetch_document(url)
104
- # TODO: Refactor following idiom
105
- if(url == nil)
106
- url = self.url
107
- end
108
-
109
- if(url.nil?)
110
- return nil
111
- end
112
-
113
- # TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
114
- cache = Apollo::Caches::Factory.instance.construct
115
- raw = cache.get(url) do
116
- max_attempts = 3
117
- attempt_no = 0
118
- success = false
119
-
120
- res = nil
121
- while(attempt_no < max_attempts && success == false) do
122
- begin
123
- res = open(url).read
124
- success = true
125
- rescue Exception => e
126
- puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
127
- sleep 1
128
-
129
- attempt_no = attempt_no + 1
130
- success = false
131
- end
132
- end
133
-
134
- res
135
- end
136
-
137
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
138
- doc = Nokogiri::HTML(raw)
139
- return doc
140
- end
141
-
142
- # Extracts data from document
143
- def extract_data(doc)
144
- res = []
145
- return res
146
- end
147
-
148
- # Extract links to another documents from this document
149
- def extract_links(doc)
150
- res = []
151
- return res
152
- end
153
- end
154
- end
155
- end
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
+ module Apollo
5
+ module Crawler
6
+ class CrawlerBase
7
+ @backlog = nil
8
+
9
+ def initialize
10
+ @backlog = []
11
+ end
12
+
13
+ # Name of the crawler
14
+ def name
15
+ return "Crawler Base"
16
+ end
17
+
18
+ def url
19
+ return nil
20
+ end
21
+
22
+ def self.try_get_url(root, url)
23
+ begin
24
+ return URI.join(root, url)
25
+ rescue
26
+ return nil
27
+ end
28
+ end
29
+
30
+ # - (0) Figure out URL
31
+ # - (1) Extract Data
32
+ # - (2) Extract Links
33
+ # - (3) Go to (0) eventually
34
+ def etl(url=nil, &block)
35
+ # Look for passed URL use default instead and fail if it is not valid
36
+ if(url.nil? || url.empty?)
37
+ url = self.url
38
+ end
39
+
40
+ if(url.nil?)
41
+ return nil
42
+ end
43
+
44
+ if(url.kind_of?(Array))
45
+ @backlog.concat(url)
46
+ else
47
+ @backlog << url
48
+ end
49
+
50
+ res = []
51
+ # TODO: Respect limit of documents/urls processed
52
+ while(@backlog.empty? == false)
53
+ url = @backlog.shift
54
+
55
+ # puts "Processing '#{url}'"
56
+ doc = self.process_url(url)
57
+ res << doc
58
+
59
+ # TODO: Use log4r and log it only on info level
60
+ # TODO: Add some async/callback signal for document processed
61
+ if block_given?
62
+ yield res
63
+ end
64
+
65
+ if(!doc.nil? && !doc.empty?)
66
+ doc[:links].each do |link|
67
+ url = link[:link].to_s
68
+ # TODO: Use log4r and log it only on info level
69
+ #puts url
70
+
71
+ # TODO: Check if it is unique
72
+ @backlog << url
73
+ end
74
+ end
75
+ end
76
+ return res
77
+ end
78
+
79
+ def process_url(url)
80
+ # Try fetch document
81
+ doc = self.fetch_document(url)
82
+ if(doc.nil?)
83
+ return nil
84
+ end
85
+
86
+ # Try extract data from document
87
+ data = self.extract_data(doc)
88
+
89
+ # Try to extract links to other documents
90
+ links = self.extract_links(doc)
91
+ puts links.inspect
92
+
93
+ # Format ETL result
94
+ res = {
95
+ :crawler => self.class.name,
96
+ :title => doc.title,
97
+ :data => data,
98
+ :links => links
99
+ }
100
+
101
+ return res
102
+ end
103
+
104
+ # Fetch document
105
+ def fetch_document(url)
106
+ # TODO: Refactor following idiom
107
+ if(url == nil)
108
+ url = self.url
109
+ end
110
+
111
+ if(url.nil?)
112
+ return nil
113
+ end
114
+
115
+ # TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
116
+ cache = Apollo::Cache::Factory.instance.construct
117
+ raw = cache.get(url) do
118
+ max_attempts = 3
119
+ attempt_no = 0
120
+ success = false
121
+
122
+ res = nil
123
+ while(attempt_no < max_attempts && success == false) do
124
+ begin
125
+ res = open(url).read
126
+ success = true
127
+ rescue Exception => e
128
+ puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
129
+ sleep 1
130
+
131
+ attempt_no = attempt_no + 1
132
+ success = false
133
+ end
134
+ end
135
+
136
+ res
137
+ end
138
+
139
+ # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
140
+ doc = Nokogiri::HTML(raw)
141
+ return doc
142
+ end
143
+
144
+ # Extracts data from document
145
+ def extract_data(doc)
146
+ res = []
147
+ return res
148
+ end
149
+
150
+ # Extract links to another documents from this document
151
+ def extract_links(doc)
152
+ res = []
153
+ return res
154
+ end
155
+ end # CrawlerBase
156
+ end # Crawler
157
+ end # Apollo