apollo-crawler 0.0.48 → 0.0.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/apollo-crawler CHANGED
@@ -24,6 +24,7 @@ require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'versio
24
24
 
25
25
  module Apollo
26
26
  class CrawlerProgram
27
+ @@CACHES_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "caches")
27
28
  @@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
28
29
  @@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
29
30
  @@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
@@ -33,12 +34,14 @@ module Apollo
33
34
  # OptionParser.
34
35
  @options = nil
35
36
  @optparser = nil
37
+ @caches = nil
36
38
  @crawlers = nil
37
39
  @formatters = nil
38
40
  @formatter = nil
39
41
 
40
42
  # Initializer - Constructor
41
43
  def initialize
44
+ @caches = {}
42
45
  @crawlers = {}
43
46
  @formatters = {}
44
47
  end
@@ -48,6 +51,9 @@ module Apollo
48
51
  @options = {}
49
52
  @options[:verbose] = false
50
53
  @options[:version] = false
54
+ @options[:cache_dirs] = [
55
+ @@CACHES_DIR
56
+ ]
51
57
  @options[:crawler_dirs] = [
52
58
  @@CRAWLERS_DIR
53
59
  ]
@@ -130,10 +136,10 @@ module Apollo
130
136
  end
131
137
  end
132
138
 
133
- # Register formatters
134
- def register_formatters(dir)
139
+ # Register caches
140
+ def register_cache(dir)
135
141
  if(@options[:verbose])
136
- puts "Registering formatters - '#{dir}'"
142
+ puts "Registering caches - '#{dir}'"
137
143
  end
138
144
 
139
145
  files = File.join(dir, "**", "*.rb")
@@ -141,24 +147,25 @@ module Apollo
141
147
  require file
142
148
  end
143
149
 
144
- tmp = Apollo::Formatters.constants.select { |c|
145
- Class === Apollo::Formatters.const_get(c)
150
+ tmp = Apollo::Caches.constants.select { |c|
151
+ Class === Apollo::Caches.const_get(c)
146
152
  }
147
153
 
148
154
  tmp.each do |x|
149
- klass = Object.const_get('Apollo').const_get('Formatters').const_get(x)
150
- @formatters.merge!({ x.downcase.to_s => klass})
155
+ klass = Object.const_get('Apollo').const_get('Caches').const_get(x)
156
+ @caches.merge!({ x.downcase.to_s => klass})
151
157
  end
152
158
 
153
159
  if(@options[:verbose])
154
- @formatters.each do |formatter, klass|
155
- name = klass.new.class.name
160
+ @caches.each do |cache, klass|
161
+ name = klass
156
162
 
157
- if name == "Apollo::Formatters::Formatter"
163
+ # klass.ancestors.include?(Apollo::Caches::Cache)
164
+ if name == "Apollo::Caches::Cache"
158
165
  next
159
166
  end
160
167
 
161
- puts "Registered formatter '#{formatter}' -> '#{name}'"
168
+ puts "Registered cache '#{cache}' -> '#{name}'"
162
169
  end
163
170
  end
164
171
  end
@@ -196,6 +203,39 @@ module Apollo
196
203
  end
197
204
  end
198
205
 
206
+ # Register formatters
207
# Loads every "*.rb" file found (recursively) under +dir+ and records each
# class defined directly in Apollo::Formatters in @formatters, keyed by the
# lower-cased constant name (e.g. "json" => Apollo::Formatters::Json).
#
# dir - directory that contains formatter source files
#
# When @options[:verbose] is set, progress is printed to standard output and
# @formatters is returned; otherwise nil is returned.
def register_formatters(dir)
  puts "Registering formatters - '#{dir}'" if @options[:verbose]

  # Require by absolute path: a relative path handed to `require` is looked
  # up on $LOAD_PATH rather than relative to the working directory.
  Dir.glob(File.join(dir, "**", "*.rb")).each do |file|
    require File.expand_path(file)
  end

  Apollo::Formatters.constants.each do |const|
    klass = Apollo::Formatters.const_get(const)
    next unless klass.is_a?(Class)

    @formatters[const.downcase.to_s] = klass
  end

  return unless @options[:verbose]

  @formatters.each do |formatter, klass|
    # Ask the class for its name directly; building an instance only to read
    # its class name would run the formatter's constructor for nothing.
    name = klass.name

    # The abstract base class is not a usable formatter - do not announce it
    next if name == "Apollo::Formatters::Formatter"

    puts "Registered formatter '#{formatter}' -> '#{name}'"
  end
end
238
+
199
239
  def generate_crawler(name, url = nil, matcher = nil)
200
240
  name = name.titleize.gsub(" ", "")
201
241
 
@@ -262,6 +302,11 @@ module Apollo
262
302
  exit
263
303
  end
264
304
 
305
+ # Register caches which can be used
306
+ @options[:cache_dirs].each do |dir|
307
+ register_cache(dir)
308
+ end
309
+
265
310
  # Register sites which can be crawled
266
311
  @options[:crawler_dirs].each do |dir|
267
312
  register_crawlers(dir)
@@ -278,7 +323,6 @@ module Apollo
278
323
  formatter_name = @options[:formatter]
279
324
  end
280
325
 
281
-
282
326
  # Look for specified formatter
283
327
  f = @formatters.select { |k, v|
284
328
  k.downcase == formatter_name.downcase
@@ -308,6 +352,8 @@ module Apollo
308
352
  return
309
353
  end
310
354
 
355
+
356
+
311
357
  crawlers = []
312
358
  if(ARGV.length > 0)
313
359
  crawlers << ARGV.shift
@@ -1,6 +1,13 @@
1
+ # Main
2
+ require 'apollo_crawler/cache'
1
3
  require 'apollo_crawler/crawler'
2
4
  require 'apollo_crawler/formatter'
3
5
 
6
+ # Caches
7
+ require 'apollo_crawler/caches/factory'
8
+ require 'apollo_crawler/caches/memory_cache'
9
+ require 'apollo_crawler/caches/null_cache'
10
+
4
11
  # Crawlers
5
12
  require 'apollo_crawler/crawlers/alexa_com/alexa'
6
13
  require 'apollo_crawler/crawlers/firmy_cz/firmy'
@@ -0,0 +1,34 @@
1
module Apollo
  module Caches
    # Base class of the cache implementations. It stores nothing itself:
    # every lookup is a miss and every mutation is a no-op.
    class Cache
      # Get value associated with key from cache.
      #
      # key  - lookup key (unused here, nothing is ever stored)
      # args - extra arguments, handed to the block as a single Array
      #
      # Always a miss: returns the result of the given block, or nil when no
      # block is supplied (instead of raising LocalJumpError).
      def get(key, *args)
        return nil unless block_given?

        # Not found - create and return
        yield args
      end

      # Set value associated with key.
      # Nothing is stored; returns value unchanged.
      def set(key, value)
        value
      end

      # Check if cache contains specified key - always false here.
      def contains(key)
        false
      end

      # Invalidate key/value pair - nothing to remove, always true.
      def invalidate(key)
        true
      end

      # Clear cache - nothing to clear, returns nil.
      def clear
        nil
      end
    end # Cache
  end # Caches
end # Apollo
@@ -0,0 +1,18 @@
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+ require 'singleton'
3
+
4
module Apollo
  module Caches
    # Singleton factory that builds cache objects.
    class Factory
      include Singleton

      # Class-level shortcut for Factory.instance.construct.
      #
      # The Singleton module exposes the shared object through +instance+;
      # there is no +singleton+ method, so calling it raised NoMethodError.
      def self.construct
        instance.construct
      end

      # Build a cache. Every call returns a new, empty Memory cache.
      def construct
        Memory.new
      end
    end # Factory
  end # Caches
end # Apollo
@@ -0,0 +1,30 @@
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
module Apollo
  module Caches
    # Filesystem cache. Currently a stub: nothing is read from or written to
    # disk, so every lookup is a miss and every mutation is a no-op.
    class Filesystem < Cache
      # Get value associated with key from cache.
      #
      # key  - lookup key (unused, nothing is stored)
      # args - extra arguments, handed to the block as a single Array
      #
      # Always a miss: returns the result of the given block, or nil when no
      # block is supplied (instead of raising LocalJumpError).
      def get(key, *args)
        return nil unless block_given?

        # Not found - create and return
        yield args
      end

      # Set value associated with key.
      # Nothing is stored; returns value unchanged.
      def set(key, value)
        value
      end

      # Check if cache contains specified key - always false.
      def contains(key)
        false
      end

      # Invalidate key/value pair - nothing to remove, always true.
      def invalidate(key)
        true
      end
    end # Filesystem
  end # Caches
end # Apollo
@@ -0,0 +1,43 @@
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
module Apollo
  module Caches
    # Cache that keeps its key/value pairs in a Hash owned by the instance.
    class Memory < Cache
      def initialize
        @storage = {}
      end

      # Get value associated with key from cache.
      #
      # key  - lookup key
      # args - extra arguments, handed to the block as a single Array
      #
      # On a hit the stored value is returned and the block is not run.
      # On a miss the block's result is stored under key and returned;
      # without a block a miss returns nil.
      def get(key, *args)
        # key? rather than a truthiness test, so cached nil/false count as hits
        return @storage[key] if @storage.key?(key)
        return nil unless block_given?

        # Not found - create, cache and return
        set(key, yield(args))
      end

      # Set value associated with key.
      # Returns the cached value.
      def set(key, value)
        @storage[key] = value
      end

      # Check if cache contains specified key
      def contains(key)
        @storage.key?(key)
      end

      # Invalidate key/value pair.
      # Returns the removed value, or nil when the key was not cached.
      def invalidate(key)
        @storage.delete(key)
      end

      # Clear cache - drops every stored pair.
      def clear
        @storage.clear
      end
    end # Memory
  end # Caches
end # Apollo
@@ -0,0 +1,30 @@
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
module Apollo
  module Caches
    # Cache that never stores anything: every lookup is a miss and every
    # mutation is a no-op.
    class Null < Cache
      # Get value associated with key from cache.
      #
      # key  - lookup key (unused, nothing is stored)
      # args - extra arguments, handed to the block as a single Array
      #
      # Always a miss: returns the result of the given block, or nil when no
      # block is supplied (instead of raising LocalJumpError).
      def get(key, *args)
        return nil unless block_given?

        # Not found - create and return
        yield args
      end

      # Set value associated with key.
      # Nothing is stored; returns value unchanged.
      def set(key, value)
        value
      end

      # Check if cache contains specified key - always false.
      def contains(key)
        false
      end

      # Invalidate key/value pair - nothing to remove, always true.
      def invalidate(key)
        true
      end
    end # Null
  end # Caches
end # Apollo
@@ -40,11 +40,27 @@ module Apollo
40
40
  end
41
41
 
42
42
  res = []
43
+ # TODO: Respect limit of documents/urls processed
43
44
  while(@backlog.empty? == false)
44
45
  url = @backlog.shift
45
46
 
46
47
  # puts "Processing '#{url}'"
47
- res << self.process_url(url)
48
+ doc = self.process_url(url)
49
+ res << doc
50
+
51
+ # TODO: Use log4r and log it only on info level
52
+ puts doc.inspect
53
+
54
+ if(!doc.nil? && !doc.empty?)
55
+ doc[:links].each do |link|
56
+ url = link[:link].to_s
57
+ # TODO: Use log4r and log it only on info level
58
+ #puts url
59
+
60
+ # TODO: Check if it is unique
61
+ @backlog << url
62
+ end
63
+ end
48
64
  end
49
65
  return res
50
66
  end
@@ -62,17 +78,22 @@ module Apollo
62
78
  # Try to extract links to other documents
63
79
  links = self.extract_links(doc)
64
80
 
65
- # Return ETL result
66
- return {
81
+ # Format ETL result
82
+ res = {
67
83
  :crawler => self.class.name,
68
84
  :title => doc.title,
69
85
  :data => data,
70
86
  :links => links
71
87
  }
88
+
89
+ # TODO: Add some async/callback signal for document processed
90
+
91
+ return res
72
92
  end
73
93
 
74
94
  # Fetch document
75
95
  def fetch_document(url)
96
+ # TODO: Refactor following idiom
76
97
  if(url == nil)
77
98
  url = self.url
78
99
  end
@@ -81,7 +102,11 @@ module Apollo
81
102
  return nil
82
103
  end
83
104
 
84
- raw = open(url).read
105
+ # TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
106
+ cache = Apollo::Caches::Factory.instance.construct
107
+ raw = cache.get(url) do
108
+ open(url).read
109
+ end
85
110
 
86
111
  # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
87
112
  doc = Nokogiri::HTML(raw)
@@ -19,6 +19,6 @@ module Apollo
19
19
  }
20
20
  }
21
21
  end
22
- end
22
+ end # CRAWLER_CLASS_NAME
23
23
  end # Crawlers
24
24
  end # Apollo
@@ -5,7 +5,7 @@ module Apollo
5
5
  class Slashdot < Crawler
6
6
  @@MATCHER_ITEM = "//article/header/h2/span/a"
7
7
 
8
- def name
8
# Returns the name string of this crawler.
def name()
  "Slashdot"
end
11
11
 
@@ -5,8 +5,8 @@ module Apollo
5
5
  class StackOverflow < Crawler
6
6
  @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
7
7
 
8
- def name
9
- return "StackOverflow"
8
# Returns the name string of this crawler.
def name()
  "Stackoverflow"
end
11
11
 
12
12
  def url()
@@ -17,10 +17,19 @@ module Apollo
17
17
  res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
18
  {
19
19
  :text => node['title'],
20
- :link => URI.join(self.url, node['src'])
20
+ :link => URI.join(self.url, node['src']),
21
21
  }
22
22
  }
23
23
  end
24
+
25
# Collect links to further documents found in +doc+.
#
# doc - parsed document responding to #xpath
#
# Every anchor matched by the XPath below has its href resolved against the
# crawler's base url. Anchors without an href are skipped, because URI.join
# raises on nil and would abort the whole extraction.
#
# Returns an Array of { :link => URI } hashes without duplicates.
#
# NOTE(review): the 'comicNav' selector looks like it was written for a
# different site - confirm it matches the pages this crawler processes.
def extract_links(doc)
  hrefs = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node| node['href'] }

  hrefs.compact.map { |href| { :link => URI.join(url, href) } }.uniq
end
24
33
  end
25
34
  end # Crawlers
26
35
  end # Apollo
@@ -5,7 +5,7 @@ module Apollo
5
5
  class HackerNews < Crawler
6
6
  @@MATCHER_ITEM = "//td[@class = 'title']/a"
7
7
 
8
- def name
8
# Returns the name string of this crawler.
def name()
  "Hacker News"
end
11
11
 
@@ -1,6 +1,6 @@
1
1
  module Apollo
2
2
  module Formatters
3
3
  class Formatter
4
- end
4
+ end # Formatter
5
5
  end # Formatters
6
6
  end # Apollo
@@ -24,8 +24,7 @@ module Apollo
24
24
 
25
25
  rows << data
26
26
  end
27
-
28
-
27
+
29
28
  table = Terminal::Table.new :headings => headings, :rows => rows
30
29
  return table
31
30
  end
@@ -1,5 +1,5 @@
1
1
  module Apollo
2
2
  module Crawler
3
- VERSION = '0.0.48'
3
+ VERSION = '0.0.49'
4
4
  end # Crawler
5
5
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.48
4
+ version: 0.0.49
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -230,6 +230,10 @@ files:
230
230
  - ./lib/apollo_crawler/formatters/formatter_json.rb
231
231
  - ./lib/apollo_crawler/formatters/formatter_table.rb
232
232
  - ./lib/apollo_crawler/version.rb
233
+ - ./lib/apollo_crawler/caches/factory.rb
234
+ - ./lib/apollo_crawler/caches/null_cache.rb
235
+ - ./lib/apollo_crawler/caches/memory_cache.rb
236
+ - ./lib/apollo_crawler/caches/filesystem_cache.rb
233
237
  - ./lib/apollo_crawler/crawler_template.rb
234
238
  - ./lib/apollo_crawler/crawler.rb
235
239
  - ./lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb
@@ -240,6 +244,7 @@ files:
240
244
  - ./lib/apollo_crawler/crawlers/alexa_com/alexa.rb
241
245
  - ./lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb
242
246
  - ./lib/apollo_crawler/formatter.rb
247
+ - ./lib/apollo_crawler/cache.rb
243
248
  - ./lib/apollo_crawler.rb
244
249
  - bin/apollo-crawler
245
250
  homepage: https://github.com/korczis/apollo-crawler