apollo-crawler 0.0.48 → 0.0.49

Sign up to get free protection for your applications and to get access to all the features.
data/bin/apollo-crawler CHANGED
@@ -24,6 +24,7 @@ require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'versio
24
24
 
25
25
  module Apollo
26
26
  class CrawlerProgram
27
+ @@CACHES_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "caches")
27
28
  @@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
28
29
  @@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
29
30
  @@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
@@ -33,12 +34,14 @@ module Apollo
33
34
  # OptionParser.
34
35
  @options = nil
35
36
  @optparser = nil
37
+ @caches = nil
36
38
  @crawlers = nil
37
39
  @formatters = nil
38
40
  @formatter = nil
39
41
 
40
42
  # Initializer - Constructor
41
43
  def initialize
44
+ @caches = {}
42
45
  @crawlers = {}
43
46
  @formatters = {}
44
47
  end
@@ -48,6 +51,9 @@ module Apollo
48
51
  @options = {}
49
52
  @options[:verbose] = false
50
53
  @options[:version] = false
54
+ @options[:cache_dirs] = [
55
+ @@CACHES_DIR
56
+ ]
51
57
  @options[:crawler_dirs] = [
52
58
  @@CRAWLERS_DIR
53
59
  ]
@@ -130,10 +136,10 @@ module Apollo
130
136
  end
131
137
  end
132
138
 
133
- # Register formatters
134
- def register_formatters(dir)
139
+ # Register caches
140
+ def register_cache(dir)
135
141
  if(@options[:verbose])
136
- puts "Registering formatters - '#{dir}'"
142
+ puts "Registering caches - '#{dir}'"
137
143
  end
138
144
 
139
145
  files = File.join(dir, "**", "*.rb")
@@ -141,24 +147,25 @@ module Apollo
141
147
  require file
142
148
  end
143
149
 
144
- tmp = Apollo::Formatters.constants.select { |c|
145
- Class === Apollo::Formatters.const_get(c)
150
+ tmp = Apollo::Caches.constants.select { |c|
151
+ Class === Apollo::Caches.const_get(c)
146
152
  }
147
153
 
148
154
  tmp.each do |x|
149
- klass = Object.const_get('Apollo').const_get('Formatters').const_get(x)
150
- @formatters.merge!({ x.downcase.to_s => klass})
155
+ klass = Object.const_get('Apollo').const_get('Caches').const_get(x)
156
+ @caches.merge!({ x.downcase.to_s => klass})
151
157
  end
152
158
 
153
159
  if(@options[:verbose])
154
- @formatters.each do |formatter, klass|
155
- name = klass.new.class.name
160
+ @caches.each do |cache, klass|
161
+ name = klass
156
162
 
157
- if name == "Apollo::Formatters::Formatter"
163
+ # klass.ancestors.include?(Apollo::Caches::Cache)
164
+ if name == "Apollo::Caches::Cache"
158
165
  next
159
166
  end
160
167
 
161
- puts "Registered formatter '#{formatter}' -> '#{name}'"
168
+ puts "Registered cache '#{cache}' -> '#{name}'"
162
169
  end
163
170
  end
164
171
  end
@@ -196,6 +203,39 @@ module Apollo
196
203
  end
197
204
  end
198
205
 
206
# Register formatters
#
# Requires every Ruby file found (recursively) under +dir+, then records each
# class defined directly in Apollo::Formatters in @formatters, keyed by the
# lower-cased constant name. When @options[:verbose] is set, the registered
# formatters are listed on stdout (the Formatter base class is not listed).
def register_formatters(dir)
  puts "Registering formatters - '#{dir}'" if @options[:verbose]

  Dir.glob(File.join(dir, "**", "*.rb")).each { |file| require file }

  Apollo::Formatters.constants.each do |const|
    klass = Apollo::Formatters.const_get(const)
    # Only classes are formatters; skip nested modules and plain constants.
    next unless klass.is_a?(Class)

    @formatters[const.to_s.downcase] = klass
  end

  return unless @options[:verbose]

  @formatters.each do |formatter, klass|
    # Ask the class for its name directly. Instantiating it just to read the
    # name would fail for any formatter whose constructor takes arguments.
    name = klass.name
    next if name == "Apollo::Formatters::Formatter"

    puts "Registered formatter '#{formatter}' -> '#{name}'"
  end
end
238
+
199
239
  def generate_crawler(name, url = nil, matcher = nil)
200
240
  name = name.titleize.gsub(" ", "")
201
241
 
@@ -262,6 +302,11 @@ module Apollo
262
302
  exit
263
303
  end
264
304
 
305
+ # Register caches which can be used
306
+ @options[:cache_dirs].each do |dir|
307
+ register_cache(dir)
308
+ end
309
+
265
310
  # Register sites which can be crawled
266
311
  @options[:crawler_dirs].each do |dir|
267
312
  register_crawlers(dir)
@@ -278,7 +323,6 @@ module Apollo
278
323
  formatter_name = @options[:formatter]
279
324
  end
280
325
 
281
-
282
326
  # Look for specified formatter
283
327
  f = @formatters.select { |k, v|
284
328
  k.downcase == formatter_name.downcase
@@ -308,6 +352,8 @@ module Apollo
308
352
  return
309
353
  end
310
354
 
355
+
356
+
311
357
  crawlers = []
312
358
  if(ARGV.length > 0)
313
359
  crawlers << ARGV.shift
@@ -1,6 +1,13 @@
1
+ # Main
2
+ require 'apollo_crawler/cache'
1
3
  require 'apollo_crawler/crawler'
2
4
  require 'apollo_crawler/formatter'
3
5
 
6
+ # Caches
7
+ require 'apollo_crawler/caches/factory'
8
+ require 'apollo_crawler/caches/memory_cache'
9
+ require 'apollo_crawler/caches/null_cache'
10
+
4
11
  # Crawlers
5
12
  require 'apollo_crawler/crawlers/alexa_com/alexa'
6
13
  require 'apollo_crawler/crawlers/firmy_cz/firmy'
@@ -0,0 +1,34 @@
1
module Apollo
  module Caches
    # Base class of all caches. It defines the cache interface and stores
    # nothing itself: every lookup is a miss and every write is discarded.
    class Cache
      # Get value associated with key from cache.
      #
      # This base implementation never has the value, so it always falls back
      # to the block. The block receives the extra arguments as one array.
      # Returns the block's result, or nil when no block is given.
      def get(key, *args)
        # A bare `yield` without a block would raise LocalJumpError.
        return nil unless block_given?

        # Not found, create and return
        yield args
      end

      # Set value associated with key.
      # Returns the value (nothing is stored by the base class).
      def set(key, value)
        value
      end

      # Check if cache contains specified key (never, for the base class).
      def contains(key)
        false
      end

      # Invalidate key/value pair. Always reports success.
      def invalidate(key)
        true
      end

      # Clear cache. Nothing to clear here; returns nil.
      def clear
        nil
      end
    end # Cache
  end # Caches
end # Apollo
@@ -0,0 +1,18 @@
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+ require 'singleton'
3
+
4
module Apollo
  module Caches
    # Singleton factory that builds the cache implementation to use.
    class Factory
      include Singleton

      # Class-level shortcut for Factory.instance.construct.
      #
      # The Singleton module exposes the shared object through `instance`;
      # there is no `singleton` method, so calling one raises NoMethodError.
      def self.construct
        instance.construct
      end

      # Build a new cache object. Currently always an in-memory cache.
      #
      # NOTE(review): every call returns a fresh Memory cache, so entries are
      # not shared between calls - confirm whether that is intended.
      def construct
        Memory.new
      end
    end # Factory
  end # Caches
end # Apollo
@@ -0,0 +1,30 @@
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
module Apollo
  module Caches
    # Filesystem cache.
    #
    # NOTE(review): no file I/O is performed here yet - every method mirrors
    # the no-op behaviour of the Cache base class, so nothing is persisted.
    # `clear` is inherited from Cache.
    class Filesystem < Cache
      # Get value associated with key from cache.
      #
      # Always a miss: returns the block's result (the block receives the
      # extra arguments as one array), or nil when no block is given.
      def get(key, *args)
        # A bare `yield` without a block would raise LocalJumpError.
        return nil unless block_given?

        # Not found, create and return
        yield args
      end

      # Set value associated with key.
      # Returns the value (nothing is stored).
      def set(key, value)
        value
      end

      # Check if cache contains specified key (currently never).
      def contains(key)
        false
      end

      # Invalidate key/value pair. Always reports success.
      def invalidate(key)
        true
      end
    end # Filesystem
  end # Caches
end # Apollo
@@ -0,0 +1,43 @@
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
module Apollo
  module Caches
    # In-memory cache backed by a Hash that lives as long as the cache object.
    class Memory < Cache
      def initialize
        @storage = {}
      end

      # Get value associated with key from cache.
      #
      # On a hit the stored value is returned and the block is not run.
      # On a miss the block is called (it receives the extra arguments as one
      # array), its result is stored under key and returned. Without a block
      # a miss returns nil.
      def get(key, *args)
        # key? rather than a truthiness test, so cached nil/false still hit.
        return @storage[key] if @storage.key?(key)
        return nil unless block_given?

        # Not found, create, cache and return
        @storage[key] = yield(args)
      end

      # Set value associated with key.
      # Returns the cached value.
      def set(key, value)
        @storage[key] = value
      end

      # Check if cache contains specified key.
      def contains(key)
        @storage.key?(key)
      end

      # Invalidate key/value pair.
      # Returns the removed value, or nil when the key was not cached.
      def invalidate(key)
        @storage.delete(key)
      end

      # Clear cache.
      def clear
        @storage.clear
      end
    end # Memory
  end # Caches
end # Apollo
@@ -0,0 +1,30 @@
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
module Apollo
  module Caches
    # Cache that never stores anything: every lookup is a miss.
    # `clear` is inherited from Cache.
    class Null < Cache
      # Get value associated with key from cache.
      #
      # Always a miss: returns the block's result (the block receives the
      # extra arguments as one array), or nil when no block is given.
      def get(key, *args)
        # A bare `yield` without a block would raise LocalJumpError.
        return nil unless block_given?

        # Not found, create and return
        yield args
      end

      # Set value associated with key.
      # Returns the value (nothing is stored).
      def set(key, value)
        value
      end

      # Check if cache contains specified key (never).
      def contains(key)
        false
      end

      # Invalidate key/value pair. Always reports success.
      def invalidate(key)
        true
      end
    end # Null
  end # Caches
end # Apollo
@@ -40,11 +40,27 @@ module Apollo
40
40
  end
41
41
 
42
42
  res = []
43
+ # TODO: Respect limit of documents/urls processed
43
44
  while(@backlog.empty? == false)
44
45
  url = @backlog.shift
45
46
 
46
47
  # puts "Processing '#{url}'"
47
- res << self.process_url(url)
48
+ doc = self.process_url(url)
49
+ res << doc
50
+
51
+ # TODO: Use log4r and log it only on info level
52
+ puts doc.inspect
53
+
54
+ if(!doc.nil? && !doc.empty?)
55
+ doc[:links].each do |link|
56
+ url = link[:link].to_s
57
+ # TODO: Use log4r and log it only on info level
58
+ #puts url
59
+
60
+ # TODO: Check if it is unique
61
+ @backlog << url
62
+ end
63
+ end
48
64
  end
49
65
  return res
50
66
  end
@@ -62,17 +78,22 @@ module Apollo
62
78
  # Try extract links for another documents
63
79
  links = self.extract_links(doc)
64
80
 
65
- # Return ETL result
66
- return {
81
+ # Format ETL result
82
+ res = {
67
83
  :crawler => self.class.name,
68
84
  :title => doc.title,
69
85
  :data => data,
70
86
  :links => links
71
87
  }
88
+
89
+ # TODO: Add some async/callback signal for document processed
90
+
91
+ return res
72
92
  end
73
93
 
74
94
  # Fetch document
75
95
  def fetch_document(url)
96
+ # TODO: Refactor following idiom
76
97
  if(url == nil)
77
98
  url = self.url
78
99
  end
@@ -81,7 +102,11 @@ module Apollo
81
102
  return nil
82
103
  end
83
104
 
84
- raw = open(url).read
105
+ # TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
106
+ cache = Apollo::Caches::Factory.instance.construct
107
+ raw = cache.get(url) do
108
+ open(url).read
109
+ end
85
110
 
86
111
  # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
87
112
  doc = Nokogiri::HTML(raw)
@@ -19,6 +19,6 @@ module Apollo
19
19
  }
20
20
  }
21
21
  end
22
- end
22
+ end # CRAWLER_CLASS_NAME
23
23
  end # Crawlers
24
24
  end # Apollo
@@ -5,7 +5,7 @@ module Apollo
5
5
  class Slashdot < Crawler
6
6
  @@MATCHER_ITEM = "//article/header/h2/span/a"
7
7
 
8
- def name
8
+ def name()
9
9
  return "Slashdot"
10
10
  end
11
11
 
@@ -5,8 +5,8 @@ module Apollo
5
5
  class StackOverflow < Crawler
6
6
  @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
7
7
 
8
- def name
9
- return "StackOverflow"
8
+ def name()
9
+ return "Stackoverflow"
10
10
  end
11
11
 
12
12
  def url()
@@ -17,10 +17,19 @@ module Apollo
17
17
  res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
18
  {
19
19
  :text => node['title'],
20
- :link => URI.join(self.url, node['src'])
20
+ :link => URI.join(self.url, node['src']),
21
21
  }
22
22
  }
23
23
  end
24
+
25
+ def extract_links(doc)
26
+ res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
27
+ {
28
+ :link => URI.join(self.url, node['href']),
29
+ }
30
+ }
31
+ res.uniq
32
+ end
24
33
  end
25
34
  end # Crawlers
26
35
  end # Apollo
@@ -5,7 +5,7 @@ module Apollo
5
5
  class HackerNews < Crawler
6
6
  @@MATCHER_ITEM = "//td[@class = 'title']/a"
7
7
 
8
- def name
8
+ def name()
9
9
  return "Hacker News"
10
10
  end
11
11
 
@@ -1,6 +1,6 @@
1
1
  module Apollo
2
2
  module Formatters
3
3
  class Formatter
4
- end
4
+ end # Formatter
5
5
  end # Formatters
6
6
  end # Apollo
@@ -24,8 +24,7 @@ module Apollo
24
24
 
25
25
  rows << data
26
26
  end
27
-
28
-
27
+
29
28
  table = Terminal::Table.new :headings => headings, :rows => rows
30
29
  return table
31
30
  end
@@ -1,5 +1,5 @@
1
1
  module Apollo
2
2
  module Crawler
3
- VERSION = '0.0.48'
3
+ VERSION = '0.0.49'
4
4
  end # Crawler
5
5
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.48
4
+ version: 0.0.49
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -230,6 +230,10 @@ files:
230
230
  - ./lib/apollo_crawler/formatters/formatter_json.rb
231
231
  - ./lib/apollo_crawler/formatters/formatter_table.rb
232
232
  - ./lib/apollo_crawler/version.rb
233
+ - ./lib/apollo_crawler/caches/factory.rb
234
+ - ./lib/apollo_crawler/caches/null_cache.rb
235
+ - ./lib/apollo_crawler/caches/memory_cache.rb
236
+ - ./lib/apollo_crawler/caches/filesystem_cache.rb
233
237
  - ./lib/apollo_crawler/crawler_template.rb
234
238
  - ./lib/apollo_crawler/crawler.rb
235
239
  - ./lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb
@@ -240,6 +244,7 @@ files:
240
244
  - ./lib/apollo_crawler/crawlers/alexa_com/alexa.rb
241
245
  - ./lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb
242
246
  - ./lib/apollo_crawler/formatter.rb
247
+ - ./lib/apollo_crawler/cache.rb
243
248
  - ./lib/apollo_crawler.rb
244
249
  - bin/apollo-crawler
245
250
  homepage: https://github.com/korczis/apollo-crawler