apollo-crawler 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. checksums.yaml +8 -8
  2. data/lib/apollo_crawler.rb +20 -13
  3. data/lib/apollo_crawler/cache/{cache_base.rb → base_cache.rb} +4 -4
  4. data/lib/apollo_crawler/cache/factory.rb +1 -1
  5. data/lib/apollo_crawler/cache/memcached_cache.rb +5 -5
  6. data/lib/apollo_crawler/cache/memory_cache.rb +5 -5
  7. data/lib/apollo_crawler/cache/null_cache.rb +3 -3
  8. data/lib/apollo_crawler/config.rb +25 -10
  9. data/lib/apollo_crawler/crawler/{crawler_base.rb → base_crawler.rb} +41 -22
  10. data/lib/apollo_crawler/crawler/{google_com/google.rb → google_crawler.rb} +7 -7
  11. data/lib/apollo_crawler/crawler/{ycombinator_com/hacker_news.rb → hacker_news_crawler.rb} +7 -7
  12. data/lib/apollo_crawler/crawler/{slashdot_org/slashdot.rb → slashdot_crawler.rb} +7 -7
  13. data/lib/apollo_crawler/crawler/{stackoverflow_com/stackoverflow.rb → stackoverflow_crawler.rb} +7 -7
  14. data/lib/apollo_crawler/crawler/{xkcd_com/xkcd.rb → xkcd_crawler.rb} +5 -5
  15. data/lib/apollo_crawler/fetcher/base_fetcher.rb +11 -0
  16. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +12 -5
  17. data/lib/apollo_crawler/fetcher/smart_fetcher.rb +15 -0
  18. data/lib/apollo_crawler/formatter/base_formatter.rb +9 -0
  19. data/lib/apollo_crawler/formatter/{formatter_json.rb → json_formatter.rb} +5 -5
  20. data/lib/apollo_crawler/formatter/plain_formatter.rb +17 -0
  21. data/lib/apollo_crawler/formatter/{formatter_table.rb → table_formatter.rb} +5 -5
  22. data/lib/apollo_crawler/lib.rb +20 -13
  23. data/lib/apollo_crawler/logger/base_logger.rb +6 -0
  24. data/lib/apollo_crawler/logger/console_logger.rb +15 -0
  25. data/lib/apollo_crawler/program.rb +15 -8
  26. data/lib/apollo_crawler/store/base_store.rb +6 -0
  27. data/lib/apollo_crawler/version.rb +1 -1
  28. metadata +17 -16
  29. data/lib/apollo_crawler/cache/filesystem_cache.rb +0 -37
  30. data/lib/apollo_crawler/crawler/crawler_template.rb +0 -24
  31. data/lib/apollo_crawler/fetcher/fetcher_base.rb +0 -6
  32. data/lib/apollo_crawler/formatter/formatter_base.rb +0 -6
  33. data/lib/apollo_crawler/formatter/formatter_plain.rb +0 -17
  34. data/lib/apollo_crawler/store/store_base.rb +0 -6
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YjEzODFmZTRlMzYyYzc1MjhkMzc4MzQ5ZDk4MWNmYWQ4NjhkNWZjNw==
4
+ Y2I3MDM1OWQ1NmU2ZDMzMTg0OGRhNTYzODc5Mzg4MDhhZTkxOWJlMQ==
5
5
  data.tar.gz: !binary |-
6
- YTc2ZTFhYmU3NjE1OGJjMTU1OTllN2JkY2ZjODBmOTdjNDgwMWVhMQ==
6
+ ZWY0N2M3ZjU5ZGNmZjgwMTdkNWI0Y2JhYmZjZmUwNDFjYTA5ZjAwOA==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- OGI1YzJjZWU0MGY0MGVjN2MxYzczMGY4ODc4OWVhMTc0MTNlNTgwYjkwNzI4
10
- ZDIxYTBlMDJlMWEwMjMzYzBmOWUyYmQ5OWMwNGEwMWNlOGI5M2U1YjIxNjVi
11
- NDg2ZGM1NDBlNzI2ZTVlOTBjOGQ5NjljYTcwNmQ2NWZhY2E1OWI=
9
+ MjM1ZWUzMGUyYzBiYjdmNmFjMmE4OTU5MjA0NDE3YWJlZTE4OWQ0YTE5MWQx
10
+ ZDBmN2UzZjk3N2Y3NDYwOGNiMWFiY2JkY2I2ODJmYzFkYWU3YTYzYjI5YTI0
11
+ NDQxZDk4YzBlODBmNDg3MzRkMDU2OWY1ZmViNmUzYWFhZjZlNGU=
12
12
  data.tar.gz: !binary |-
13
- Mjk0MWJkNDc0ZmI1MWM0ZmRkZTA4Nzg5MmQzNjNjNGZiNTE2NTgwMzU4YzI5
14
- Nzc5YWUzZWVlODUwNjJjMTc3MzBlNWUxNDQ4YWEzYzZjYmM1MDdkYWQ0ZGIz
15
- NTRmMjQwMzEwZGM0NGJhODUxZjY3ZGQwODFhY2Y4M2I3ZDEzYTk=
13
+ MDIzY2RhMmFkNmI4MjBlYzkzNmZhOTA5NjM2NWUyM2YzODQwM2M0NGIzN2U1
14
+ OTgxNGYwYTdhMTIxZGVkYzNlYjE1M2U1N2NjMDdiN2I4MWNkMTMwMzRmMDMy
15
+ MWFiOWRhNGIyOTZhY2NmZDE2YTRjMzUxYjQyODU1NTAzNjQwM2M=
@@ -5,27 +5,34 @@
5
5
  require File.join(File.dirname(__FILE__), 'apollo_crawler/program')
6
6
 
7
7
  # Caches
8
- require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/cache_base')
8
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/base_cache')
9
9
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/factory')
10
- require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/filesystem_cache')
11
10
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memcached_cache')
12
11
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memory_cache')
13
12
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/null_cache')
14
13
 
15
14
  # Crawlers
16
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/crawler_base')
17
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/google_com/google')
18
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_org/slashdot')
19
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_com/stackoverflow')
20
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_com/xkcd')
21
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/ycombinator_com/hacker_news')
15
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/base_crawler')
16
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/google_crawler')
17
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/hacker_news_crawler')
18
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_crawler')
19
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_crawler')
20
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_crawler')
22
21
 
23
22
  # Fetchers
24
- require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/fetcher_base')
23
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/base_fetcher')
25
24
  require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/simple_fetcher')
25
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/smart_fetcher')
26
26
 
27
27
  # Formatters
28
- require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_base')
29
- require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_json')
30
- require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_plain')
31
- require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_table')
28
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/base_formatter')
29
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/json_formatter')
30
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/plain_formatter')
31
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/table_formatter')
32
+
33
+ # Loggers
34
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/base_logger')
35
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/console_logger')
36
+
37
+ # Stores
38
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/store/base_store')
@@ -1,6 +1,6 @@
1
1
  module Apollo
2
2
  module Cache
3
- class CacheBase
3
+ class BaseCache
4
4
  # Get value associated with key from cache
5
5
  def get(key, *args)
6
6
 
@@ -32,6 +32,6 @@ module Apollo
32
32
  def clear
33
33
  return
34
34
  end
35
- end # CacheBase
36
- end # Cache
37
- end # Apollo
35
+ end # class BaseCache
36
+ end # module Cache
37
+ end # module Apollo
@@ -2,7 +2,7 @@
2
2
  require File.join(File.dirname(__FILE__), '..', 'config')
3
3
 
4
4
  # Cache instance base class
5
- require File.join(File.dirname(__FILE__), 'cache_base')
5
+ require File.join(File.dirname(__FILE__), 'base_cache')
6
6
 
7
7
  # Factory uses singleton pattern
8
8
  require 'singleton'
@@ -1,10 +1,10 @@
1
- require File.join(File.dirname(__FILE__), 'cache_base')
1
+ require File.join(File.dirname(__FILE__), 'base_cache')
2
2
 
3
3
  require 'dalli'
4
4
 
5
5
  module Apollo
6
6
  module Cache
7
- class Memcached < CacheBase
7
+ class MemcachedCache < BaseCache
8
8
  @cache = nil
9
9
 
10
10
  def initialize
@@ -46,6 +46,6 @@ module Apollo
46
46
  def clear
47
47
  # TODO: Implement
48
48
  end
49
- end # Null
50
- end # Cache
51
- end # Apollo
49
+ end # class MemcachedCache
50
+ end # module Cache
51
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), 'cache_base')
1
+ require File.join(File.dirname(__FILE__), 'base_cache')
2
2
 
3
3
  module Apollo
4
4
  module Cache
5
- class Memory < CacheBase
5
+ class MemoryCache < BaseCache
6
6
  @cache = nil
7
7
 
8
8
  def initialize
@@ -41,6 +41,6 @@ module Apollo
41
41
  def clear
42
42
  @cache.clear
43
43
  end
44
- end # Null
45
- end # Cache
46
- end # Apollo
44
+ end # class MemoryCache
45
+ end # module Cache
46
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), 'cache_base')
1
+ require File.join(File.dirname(__FILE__), 'base_cache')
2
2
 
3
3
  module Apollo
4
4
  module Cache
5
- class Null < CacheBase
5
+ class NullCache < BaseCache
6
6
  # Get value associated with key from cache
7
7
  def get(key, *args)
8
8
  # Not found, Create, cache and return
@@ -28,6 +28,6 @@ module Apollo
28
28
  def invalidate(key)
29
29
  return true
30
30
  end
31
- end # Null
31
+ end # NullCache
32
32
  end # Cache
33
33
  end # Apollo
@@ -14,40 +14,55 @@ module RbConfig
14
14
  ############################################################
15
15
  #
16
16
  # Filesystem backend
17
- # CACHE_CLASS = Apollo::Cache::Filesystem
17
+ # CACHE_CLASS = Apollo::Cache::FilesystemCache
18
18
  #
19
19
  # Memcached - expects localhost:11211
20
- # CACHE_CLASS = Apollo::Cache::Memcached
20
+ # CACHE_CLASS = Apollo::Cache::MemcachedCache
21
21
  #
22
22
  # Pure naive ruby in-memory implementation
23
- # CACHE_CLASS = Apollo::Cache::Memory
23
+ # CACHE_CLASS = Apollo::Cache::MemoryCache
24
24
  #
25
25
  # Null caching - no caching at all
26
- # CACHE_CLASS = Apollo::Cache::Null
26
+ # CACHE_CLASS = Apollo::Cache::NullCache
27
27
 
28
28
  # Used caching mechanism by default
29
- CACHE_CLASS = Apollo::Cache::Memcached
29
+ CACHE_CLASS = Apollo::Cache::MemcachedCache
30
30
 
31
31
 
32
32
 
33
33
  ############################################################
34
34
  # Crawlers - Built-in out-of box working crawlers
35
35
  ############################################################
36
- CRAWLERS_DIR = File.join(File.dirname(__FILE__), "crawlers")
36
+ CRAWLERS_DIR = File.join(File.dirname(__FILE__), "crawler")
37
37
 
38
38
  # Template used for generated crawlers
39
- CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
39
+ CRAWLER_TEMPLATE_NAME = "crawler_template.trb"
40
40
 
41
41
  # Path of template
42
- CRAWLER_TEMPLATE_PATH = File.join(File.dirname(__FILE__), "crawler_template.rb")
42
+ CRAWLER_TEMPLATE_PATH = File.join(CRAWLERS_DIR, CRAWLER_TEMPLATE_NAME)
43
43
 
44
44
 
45
45
 
46
+ ############################################################
47
+ # Fetchers - used for fetching documents
48
+ ############################################################
49
+ FETCHERS_DIR = File.join(File.dirname(__FILE__), "fetcher")
50
+
51
+ DEFAULT_FETCHER = Apollo::Fetcher::SmartFetcher
52
+
53
+
46
54
  ############################################################
47
55
  # Formatters - used for formatting crawled documents results
48
56
  ############################################################
49
- FORMATTERS_DIR = File.join(File.dirname(__FILE__), "formatters")
57
+ FORMATTERS_DIR = File.join(File.dirname(__FILE__), "formatter")
50
58
 
51
59
  # Default formatter if no other specified
52
- DEFAULT_FORMATTER = Apollo::Formatter::Json
60
+ DEFAULT_FORMATTER = Apollo::Formatter::JsonFormatter
61
+
62
+
63
+
64
+ ############################################################
65
+ # Loggers - used for formatting output messages
66
+ ############################################################
67
+ DEFAULT_LOGGER = Apollo::Logger::ConsoleLogger
53
68
  end # Config
@@ -3,13 +3,19 @@ require "nokogiri"
3
3
 
4
4
  module Apollo
5
5
  module Crawler
6
- class CrawlerBase
6
+ class BaseCrawler
7
+
8
+
7
9
  @backlog = nil
8
10
 
9
11
  def initialize
10
12
  @backlog = []
11
13
  end
12
14
 
15
+ def self.name_re()
16
+ return /crawler$/
17
+ end
18
+
13
19
  # Name of the crawler
14
20
  def name
15
21
  return "Crawler Base"
@@ -19,6 +25,10 @@ module Apollo
19
25
  return nil
20
26
  end
21
27
 
28
+ def self.fetch(url)
29
+ RbConfig::DEFAULT_FETCHER.fetch(url)
30
+ end
31
+
22
32
  def self.try_get_url(root, url)
23
33
  begin
24
34
  return URI.join(root, url)
@@ -28,7 +38,7 @@ module Apollo
28
38
  end
29
39
 
30
40
  def self.try_get_doc(root, url)
31
- doc = CrawlerBase.try_get_url(root, url)
41
+ doc = BaseCrawler.try_get_url(root, url)
32
42
 
33
43
  # TODO: Set expiration header
34
44
  return {
@@ -47,16 +57,19 @@ module Apollo
47
57
  url = self.url
48
58
  end
49
59
 
60
+ # TODO: Be more aggressive, use assert; it is the client's responsibility!
50
61
  if(url.nil?)
51
62
  return nil
52
63
  end
53
64
 
65
+ # We support both - list of urls or single url
54
66
  if(url.kind_of?(Array))
55
67
  @backlog.concat(url)
56
68
  else
57
69
  @backlog << url
58
70
  end
59
71
 
72
+ # Counter of processed documents (pages)
60
73
  docs_processed = 0
61
74
 
62
75
  res = []
@@ -66,34 +79,40 @@ module Apollo
66
79
 
67
80
  # puts "Processing '#{url}'"
68
81
  doc = self.process_url(url)
69
- res << doc
70
82
 
71
- # TODO: Use log4r and log it only on info level
72
- # TODO: Add some async/callback signal for document processed
73
- if block_given?
74
- yield res
75
- end
83
+ # Increase counter of processed documents
84
+ docs_processed = docs_processed + 1
85
+
86
+ # Process the document if it was successfully retrieved
87
+ if(!doc.nil?)
88
+ # TODO: Use log4r and log it only on info level
89
+ if block_given?
90
+ yield doc
91
+ end
76
92
 
77
- if(!doc.nil? && !doc.empty?)
78
- doc[:links].each do |link|
79
- url = link[:link].to_s
80
- # TODO: Use log4r and log it only on info level
81
- #puts url
93
+ # Add document to queue of results
94
+ res << doc
82
95
 
83
- # TODO: Check if it is unique
84
- @backlog << url
96
+ # If the document has links, add them to the backlog
97
+ if(doc[:links].nil? == false)
98
+ doc[:links].each do |link|
99
+ url = link[:link].to_s
100
+ # TODO: Use log4r and log it only on info level
101
+ #puts url
102
+
103
+ # TODO: Check if it is unique
104
+ @backlog << url
105
+ end
85
106
  end
86
107
  end
87
108
 
88
- # Increase counter of processed documents
89
- docs_processed = docs_processed + 1
109
+ # Break if the limit of documents to process was reached
90
110
  break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
91
111
  end
92
112
  return res
93
113
  end
94
114
 
95
115
  def process_url(url)
96
- # Try fetch document
97
116
  doc = self.fetch_document(url)
98
117
  if(doc.nil?)
99
118
  return nil
@@ -140,7 +159,7 @@ module Apollo
140
159
  res = nil
141
160
  while(attempt_no < max_attempts && success == false) do
142
161
  begin
143
- res = open(url).read
162
+ res = BaseCrawler.fetch(url)
144
163
  success = true
145
164
  rescue Exception => e
146
165
  puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
@@ -170,6 +189,6 @@ module Apollo
170
189
  res = []
171
190
  return res
172
191
  end
173
- end # CrawlerBase
174
- end # Crawler
175
- end # Apollo
192
+ end # class BaseCrawler
193
+ end # module Crawler
194
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), '..', 'crawler_base')
1
+ require File.join(File.dirname(__FILE__), 'base_crawler')
2
2
 
3
3
  module Apollo
4
4
  module Crawler
5
- class Google < CrawlerBase
5
+ class GoogleCrawler < BaseCrawler
6
6
  @@MATCHER_ITEM = "//h3/a"
7
7
 
8
8
  def name()
@@ -15,7 +15,7 @@ module Apollo
15
15
 
16
16
  def extract_data(doc)
17
17
  res = doc.xpath(@@MATCHER_ITEM).map { | node |
18
- url = CrawlerBase.try_get_url(self.url, node['href'])
18
+ url = BaseCrawler.try_get_url(self.url, node['href'])
19
19
  next if url.nil?
20
20
 
21
21
  {
@@ -27,7 +27,7 @@ module Apollo
27
27
 
28
28
  def extract_links(doc)
29
29
  res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
30
- res_doc = CrawlerBase.try_get_url(self.url, node['href'])
30
+ res_doc = BaseCrawler.try_get_url(self.url, node['href'])
31
31
  next if url.nil?
32
32
 
33
33
  {
@@ -35,6 +35,6 @@ module Apollo
35
35
  }
36
36
  }
37
37
  end
38
- end
39
- end # Crawler
40
- end # Apollo
38
+ end # class GoogleCrawler
39
+ end # module Crawler
40
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), '..', 'crawler_base')
1
+ require File.join(File.dirname(__FILE__), 'base_crawler')
2
2
 
3
3
  module Apollo
4
4
  module Crawler
5
- class HackerNews < CrawlerBase
5
+ class HackerNewsCrawler < BaseCrawler
6
6
  @@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
7
7
 
8
8
  def name()
@@ -15,7 +15,7 @@ module Apollo
15
15
 
16
16
  def extract_data(doc)
17
17
  res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
- url = CrawlerBase.try_get_url(self.url, node['href'])
18
+ url = BaseCrawler.try_get_url(self.url, node['href'])
19
19
  next if url.nil?
20
20
 
21
21
  {
@@ -29,7 +29,7 @@ module Apollo
29
29
 
30
30
  def extract_links(doc)
31
31
  res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
32
- url = CrawlerBase.try_get_url(self.url, node['href'])
32
+ url = BaseCrawler.try_get_url(self.url, node['href'])
33
33
  next if url.nil?
34
34
 
35
35
  {
@@ -39,6 +39,6 @@ module Apollo
39
39
 
40
40
  return res.uniq
41
41
  end
42
- end
43
- end # Crawler
44
- end # Apollo
42
+ end # class HackerNewsCrawler
43
+ end # module Crawler
44
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), '..', 'crawler_base')
1
+ require File.join(File.dirname(__FILE__), 'base_crawler')
2
2
 
3
3
  module Apollo
4
4
  module Crawler
5
- class Slashdot < CrawlerBase
5
+ class SlashdotCrawler < BaseCrawler
6
6
  @@MATCHER_ITEM = "//article/header/h2/span/a"
7
7
 
8
8
  def name()
@@ -15,7 +15,7 @@ module Apollo
15
15
 
16
16
  def extract_data(doc)
17
17
  res = doc.xpath(@@MATCHER_ITEM).map { | node |
18
- url = CrawlerBase.try_get_url(self.url, node['href'])
18
+ url = BaseCrawler.try_get_url(self.url, node['href'])
19
19
  next if url.nil?
20
20
 
21
21
  {
@@ -27,7 +27,7 @@ module Apollo
27
27
 
28
28
  def extract_links(doc)
29
29
  res = doc.xpath(@@MATCHER_ITEM).map { | node |
30
- url = CrawlerBase.try_get_url(self.url, node['href'])
30
+ url = BaseCrawler.try_get_url(self.url, node['href'])
31
31
  next if url.nil?
32
32
 
33
33
  {
@@ -35,6 +35,6 @@ module Apollo
35
35
  }
36
36
  }
37
37
  end
38
- end
39
- end # Crawler
40
- end # Apollo
38
+ end # class SlashdotCrawler
39
+ end # module Crawler
40
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), '..', 'crawler_base')
1
+ require File.join(File.dirname(__FILE__), 'base_crawler')
2
2
 
3
3
  module Apollo
4
4
  module Crawler
5
- class StackOverflow < CrawlerBase
5
+ class StackoverflowCrawler < BaseCrawler
6
6
  @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
7
7
 
8
8
  def name()
@@ -15,7 +15,7 @@ module Apollo
15
15
 
16
16
  def extract_data(doc)
17
17
  res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
- url = CrawlerBase.try_get_url(self.url, node['href'])
18
+ url = BaseCrawler.try_get_url(self.url, node['href'])
19
19
  next if url.nil?
20
20
 
21
21
  {
@@ -29,7 +29,7 @@ module Apollo
29
29
 
30
30
  def extract_links(doc)
31
31
  res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
32
- url = CrawlerBase.try_get_url(self.url, node['href'])
32
+ url = BaseCrawler.try_get_url(self.url, node['href'])
33
33
  next if url.nil?
34
34
 
35
35
  {
@@ -39,6 +39,6 @@ module Apollo
39
39
 
40
40
  return res.uniq
41
41
  end
42
- end
43
- end # Crawler
44
- end # Apollo
42
+ end # class StackoverflowCrawler
43
+ end # module Crawler
44
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), '..', 'crawler_base')
1
+ require File.join(File.dirname(__FILE__), 'base_crawler')
2
2
 
3
3
  module Apollo
4
4
  module Crawler
5
- class Xkcd < CrawlerBase
5
+ class XkcdCrawler < BaseCrawler
6
6
  @@MATCHER_ITEM = "//div[@id = 'comic']/img"
7
7
 
8
8
  def name()
@@ -30,6 +30,6 @@ module Apollo
30
30
  }
31
31
  res.uniq
32
32
  end
33
- end
34
- end # Crawler
35
- end # Apollo
33
+ end # class XkcdCrawler
34
+ end # module Crawler
35
+ end # module Apollo
@@ -0,0 +1,11 @@
1
+ module Apollo
2
+ module Fetcher
3
+ class BaseFetcher
4
+ def self.fetch(url)
5
+ # TODO: Throw exception ???
6
+ return nil
7
+ end
8
+
9
+ end # class BaseFetcher
10
+ end # module Fetcher
11
+ end # module Apollo
@@ -1,8 +1,15 @@
1
- require File.join(File.dirname(__FILE__), 'fetcher_base')
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
+ require File.join(File.dirname(__FILE__), 'base_fetcher')
2
5
 
3
6
  module Apollo
4
7
  module Fetcher
5
- class SimpleFetcher < FetcherBase
6
- end # Crawler
7
- end # Fetcher
8
- end # Apollo
8
+ class SimpleFetcher < BaseFetcher
9
+ def self.fetch(url)
10
+ # TODO: Throw exception ???
11
+ return open(url).read
12
+ end
13
+ end # class SimpleFetcher
14
+ end # module Fetcher
15
+ end # module Apollo
@@ -0,0 +1,15 @@
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
+ require File.join(File.dirname(__FILE__), 'base_fetcher')
5
+
6
+ module Apollo
7
+ module Fetcher
8
+ class SmartFetcher < BaseFetcher
9
+ def self.fetch(url)
10
+ # TODO: Throw exception ???
11
+ return open(url).read
12
+ end
13
+ end # class SmartFetcher
14
+ end # module Fetcher
15
+ end # module Apollo
@@ -0,0 +1,9 @@
1
+ module Apollo
2
+ module Formatter
3
+ class BaseFormatter
4
+ def self.name_re()
5
+ return /formatter$/
6
+ end
7
+ end # class BaseFormatter
8
+ end # module Formatter
9
+ end # module Apollo
@@ -1,10 +1,10 @@
1
1
  require 'json'
2
2
 
3
- require File.join(File.dirname(__FILE__), 'formatter_base')
3
+ require File.join(File.dirname(__FILE__), 'base_formatter')
4
4
 
5
5
  module Apollo
6
6
  module Formatter
7
- class Json < FormatterBase
7
+ class JsonFormatter < BaseFormatter
8
8
  def format(obj)
9
9
  return Json.format(obj)
10
10
  end
@@ -12,6 +12,6 @@ module Apollo
12
12
  def self.format(obj)
13
13
  return JSON.pretty_generate(obj)
14
14
  end
15
- end
16
- end # Formatter
17
- end # Apollo
15
+ end # class JsonFormatter
16
+ end # module Formatter
17
+ end # module Apollo
@@ -0,0 +1,17 @@
1
+ require 'awesome_print'
2
+
3
+ require File.join(File.dirname(__FILE__), 'base_formatter')
4
+
5
+ module Apollo
6
+ module Formatter
7
+ class PlainFormatter < BaseFormatter
8
+ def format(obj)
9
+ return Plain.format(obj)
10
+ end
11
+
12
+ def self.format(obj)
13
+ return obj.inspect
14
+ end
15
+ end # class PlainFormatter
16
+ end # module Formatter
17
+ end # module Apollo
@@ -1,10 +1,10 @@
1
1
  require 'terminal-table'
2
2
 
3
- require File.join(File.dirname(__FILE__), 'formatter_base')
3
+ require File.join(File.dirname(__FILE__), 'base_formatter')
4
4
 
5
5
  module Apollo
6
6
  module Formatter
7
- class Table < FormatterBase
7
+ class TableFormatter < BaseFormatter
8
8
  def format(obj)
9
9
  return Table.format(obj)
10
10
  end
@@ -30,6 +30,6 @@ module Apollo
30
30
  table = Terminal::Table.new :headings => headings, :rows => rows
31
31
  return table
32
32
  end
33
- end
34
- end # Formatter
35
- end # Apollo
33
+ end # class TableFormatter
34
+ end # module Formatter
35
+ end # module Apollo
@@ -2,27 +2,34 @@
2
2
  require File.join(File.dirname(__FILE__), 'program')
3
3
 
4
4
  # Caches
5
- require File.join(File.dirname(__FILE__), 'cache/cache_base')
5
+ require File.join(File.dirname(__FILE__), 'cache/base_cache')
6
6
  require File.join(File.dirname(__FILE__), 'cache/factory')
7
- require File.join(File.dirname(__FILE__), 'cache/filesystem_cache')
8
7
  require File.join(File.dirname(__FILE__), 'cache/memcached_cache')
9
8
  require File.join(File.dirname(__FILE__), 'cache/memory_cache')
10
9
  require File.join(File.dirname(__FILE__), 'cache/null_cache')
11
10
 
12
11
  # Crawlers
13
- require File.join(File.dirname(__FILE__), 'crawler/crawler_base')
14
- require File.join(File.dirname(__FILE__), 'crawler/google_com/google')
15
- require File.join(File.dirname(__FILE__), 'crawler/slashdot_org/slashdot')
16
- require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_com/stackoverflow')
17
- require File.join(File.dirname(__FILE__), 'crawler/xkcd_com/xkcd')
18
- require File.join(File.dirname(__FILE__), 'crawler/ycombinator_com/hacker_news')
12
+ require File.join(File.dirname(__FILE__), 'crawler/base_crawler')
13
+ require File.join(File.dirname(__FILE__), 'crawler/google_crawler')
14
+ require File.join(File.dirname(__FILE__), 'crawler/hacker_news_crawler')
15
+ require File.join(File.dirname(__FILE__), 'crawler/slashdot_crawler')
16
+ require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_crawler')
17
+ require File.join(File.dirname(__FILE__), 'crawler/xkcd_crawler')
19
18
 
20
19
  # Fetchers
21
- require File.join(File.dirname(__FILE__), 'fetcher/fetcher_base')
20
+ require File.join(File.dirname(__FILE__), 'fetcher/base_fetcher')
22
21
  require File.join(File.dirname(__FILE__), 'fetcher/simple_fetcher')
22
+ require File.join(File.dirname(__FILE__), 'fetcher/smart_fetcher')
23
23
 
24
24
  # Formatters
25
- require File.join(File.dirname(__FILE__), 'formatter/formatter_base')
26
- require File.join(File.dirname(__FILE__), 'formatter/formatter_json')
27
- require File.join(File.dirname(__FILE__), 'formatter/formatter_plain')
28
- require File.join(File.dirname(__FILE__), 'formatter/formatter_table')
25
+ require File.join(File.dirname(__FILE__), 'formatter/base_formatter')
26
+ require File.join(File.dirname(__FILE__), 'formatter/json_formatter')
27
+ require File.join(File.dirname(__FILE__), 'formatter/plain_formatter')
28
+ require File.join(File.dirname(__FILE__), 'formatter/table_formatter')
29
+
30
+ # Loggers
31
+ require File.join(File.dirname(__FILE__), 'logger/base_logger')
32
+ require File.join(File.dirname(__FILE__), 'logger/console_logger')
33
+
34
+ # Stores
35
+ require File.join(File.dirname(__FILE__), 'store/base_store')
@@ -0,0 +1,6 @@
1
+ module Apollo
2
+ module Logger
3
+ class BaseLogger
4
+ end # class BaseLogger
5
+ end # module Logger
6
+ end # module Apollo
@@ -0,0 +1,15 @@
1
+ require File.join(File.dirname(__FILE__), 'base_logger')
2
+
3
+ module Apollo
4
+ module Logger
5
+ class ConsoleLogger < BaseLogger
6
+ def log(msg)
7
+ puts msg
8
+ end
9
+
10
+ def self.log(msg)
11
+ return Logger.log(msg)
12
+ end
13
+ end # class ConsoleLogger
14
+ end # module Logger
15
+ end # module Apollo
@@ -125,7 +125,7 @@ module Apollo
125
125
  config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
126
126
  if(File.exists?(config))
127
127
  if(@options[:verbose])
128
- puts "Loading config '#{config}'"
128
+ RbConfig::DEFAULT_LOGGER "Loading config '#{config}'"
129
129
  end
130
130
 
131
131
  # puts "Let's require '#{@options[:verbose]}'"
@@ -192,7 +192,8 @@ module Apollo
192
192
 
193
193
  tmp.each do |x|
194
194
  klass = Object.const_get('Apollo').const_get('Crawler').const_get(x)
195
- @crawlers.merge!({ x.downcase.to_s => klass})
195
+ name = x.to_s.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re,"")
196
+ @crawlers.merge!({ name => klass})
196
197
  end
197
198
 
198
199
  if(@options[:verbose])
@@ -225,7 +226,8 @@ module Apollo
225
226
 
226
227
  tmp.each do |x|
227
228
  klass = Object.const_get('Apollo').const_get('Formatter').const_get(x)
228
- @formatters.merge!({ x.downcase.to_s => klass})
229
+ name = x.to_s.downcase.gsub(Apollo::Formatter::BaseFormatter.name_re,"")
230
+ @formatters.merge!({ name => klass})
229
231
  end
230
232
 
231
233
  if(@options[:verbose])
@@ -266,9 +268,10 @@ module Apollo
266
268
  end
267
269
 
268
270
  template_path = RbConfig::CRAWLER_TEMPLATE_PATH
271
+ puts template_path
269
272
  if(File.exists?(template_path) == false)
270
273
  puts "Template file '#{template_path}' does not exists!"
271
- return
274
+ return -1
272
275
  end
273
276
 
274
277
  if(@options[:verbose])
@@ -301,6 +304,8 @@ module Apollo
301
304
  end
302
305
  end
303
306
  end
307
+
308
+ return 0
304
309
  end
305
310
 
306
311
  def self.console_table(headings, rows)
@@ -342,8 +347,7 @@ module Apollo
342
347
  url = args.length > 0 ? args[0] : nil
343
348
  matcher = args.length > 1 ? args[1] : nil
344
349
 
345
- self.generate_crawler(name, url, matcher)
346
- return 0
350
+ return self.generate_crawler(name, url, matcher)
347
351
  end
348
352
 
349
353
  register_modules()
@@ -356,7 +360,8 @@ module Apollo
356
360
 
357
361
  # Look for specified formatter
358
362
  f = @formatters.select { |k, v|
359
- k.downcase == formatter_name.downcase
363
+ name = formatter_name.gsub(Apollo::Formatter::BaseFormatter::name_re, "")
364
+ k.downcase == name
360
365
  }
361
366
 
362
367
  if(f)
@@ -388,7 +393,9 @@ module Apollo
388
393
  end
389
394
 
390
395
  crawlers.each do |crawler|
391
- p = @crawlers[crawler.downcase]
396
+ crawler_name = crawler.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
397
+
398
+ p = @crawlers[crawler_name]
392
399
  if(p == nil)
393
400
  puts "Invalid crawler name - '#{crawler}'"
394
401
  puts "See program help"
@@ -0,0 +1,6 @@
1
+ module Apollo
2
+ module Store
3
+ class BaseStore
4
+ end # class BaseStore
5
+ end # module Store
6
+ end # module Apollo
@@ -1,3 +1,3 @@
1
1
  module Apollo
2
- VERSION = '0.1.8'
2
+ VERSION = '0.1.9'
3
3
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak
@@ -227,30 +227,31 @@ executables:
227
227
  extensions: []
228
228
  extra_rdoc_files: []
229
229
  files:
230
+ - ./lib/apollo_crawler/fetcher/smart_fetcher.rb
230
231
  - ./lib/apollo_crawler/fetcher/simple_fetcher.rb
231
- - ./lib/apollo_crawler/fetcher/fetcher_base.rb
232
+ - ./lib/apollo_crawler/fetcher/base_fetcher.rb
232
233
  - ./lib/apollo_crawler/lib.rb
233
234
  - ./lib/apollo_crawler/version.rb
235
+ - ./lib/apollo_crawler/logger/console_logger.rb
236
+ - ./lib/apollo_crawler/logger/base_logger.rb
234
237
  - ./lib/apollo_crawler/program.rb
235
238
  - ./lib/apollo_crawler/config.rb
236
239
  - ./lib/apollo_crawler/cache/factory.rb
237
240
  - ./lib/apollo_crawler/cache/null_cache.rb
238
- - ./lib/apollo_crawler/cache/cache_base.rb
239
241
  - ./lib/apollo_crawler/cache/memory_cache.rb
240
- - ./lib/apollo_crawler/cache/filesystem_cache.rb
242
+ - ./lib/apollo_crawler/cache/base_cache.rb
241
243
  - ./lib/apollo_crawler/cache/memcached_cache.rb
242
- - ./lib/apollo_crawler/crawler/crawler_template.rb
243
- - ./lib/apollo_crawler/crawler/stackoverflow_com/stackoverflow.rb
244
- - ./lib/apollo_crawler/crawler/xkcd_com/xkcd.rb
245
- - ./lib/apollo_crawler/crawler/google_com/google.rb
246
- - ./lib/apollo_crawler/crawler/crawler_base.rb
247
- - ./lib/apollo_crawler/crawler/slashdot_org/slashdot.rb
248
- - ./lib/apollo_crawler/crawler/ycombinator_com/hacker_news.rb
249
- - ./lib/apollo_crawler/formatter/formatter_base.rb
250
- - ./lib/apollo_crawler/formatter/formatter_plain.rb
251
- - ./lib/apollo_crawler/formatter/formatter_json.rb
252
- - ./lib/apollo_crawler/formatter/formatter_table.rb
253
- - ./lib/apollo_crawler/store/store_base.rb
244
+ - ./lib/apollo_crawler/crawler/xkcd_crawler.rb
245
+ - ./lib/apollo_crawler/crawler/google_crawler.rb
246
+ - ./lib/apollo_crawler/crawler/slashdot_crawler.rb
247
+ - ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
248
+ - ./lib/apollo_crawler/crawler/base_crawler.rb
249
+ - ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
250
+ - ./lib/apollo_crawler/formatter/table_formatter.rb
251
+ - ./lib/apollo_crawler/formatter/base_formatter.rb
252
+ - ./lib/apollo_crawler/formatter/json_formatter.rb
253
+ - ./lib/apollo_crawler/formatter/plain_formatter.rb
254
+ - ./lib/apollo_crawler/store/base_store.rb
254
255
  - ./lib/apollo_crawler.rb
255
256
  - bin/apollo-crawler
256
257
  homepage: https://github.com/korczis/apollo-crawler
@@ -1,37 +0,0 @@
1
- require File.join(File.dirname(__FILE__), 'cache_base')
2
-
3
- module Apollo
4
- module Cache
5
- class Filesystem < CacheBase
6
- def initialize
7
- # puts "This if Filesystem cache"
8
- end
9
-
10
- # Get value associated with key from cache
11
- def get(key, *args)
12
- # Not found, Create, cache and return
13
- if block_given?
14
- res = yield args
15
- end
16
-
17
- return res
18
- end
19
-
20
- # Set value associated with key
21
- # Return cached value
22
- def set(key, value)
23
- return value
24
- end
25
-
26
- # Check if cache contains specified key
27
- def contains(key)
28
- return false
29
- end
30
-
31
- # Invalidate key/value pair
32
- def invalidate(key)
33
- return true
34
- end
35
- end # Filesystem
36
- end # Cache
37
- end # Apollo
@@ -1,24 +0,0 @@
1
- module Apollo
2
- module Crawler
3
- class CRAWLER_CLASS_NAME < Crawler
4
- @@MATCHER_ITEM = "CRAWLER_MATCHER"
5
-
6
- def name()
7
- return "CRAWLER_NAME"
8
- end
9
-
10
- def url()
11
- return "CRAWLER_URL"
12
- end
13
-
14
- def extract_data(doc)
15
- res = doc.xpath(@@MATCHER_ITEM).map { |i|
16
- {
17
- :text => i.text,
18
- :link => URI.join(self.url, i['href'])
19
- }
20
- }
21
- end
22
- end # CRAWLER_CLASS_NAME
23
- end # Crawler
24
- end # Apollo
@@ -1,6 +0,0 @@
1
- module Apollo
2
- module Fetcher
3
- class FetcherBase
4
- end # FetcherBase
5
- end # Fetcher
6
- end # Apollo
@@ -1,6 +0,0 @@
1
- module Apollo
2
- module Formatter
3
- class FormatterBase
4
- end # FormatterBase
5
- end # Formatter
6
- end # Apollo
@@ -1,17 +0,0 @@
1
- require 'awesome_print'
2
-
3
- require File.join(File.dirname(__FILE__), 'formatter_base')
4
-
5
- module Apollo
6
- module Formatter
7
- class Plain < FormatterBase
8
- def format(obj)
9
- return Plain.format(obj)
10
- end
11
-
12
- def self.format(obj)
13
- return obj.inspect
14
- end
15
- end
16
- end # Formatter
17
- end # Apollo
@@ -1,6 +0,0 @@
1
- module Apollo
2
- module Store
3
- class StoreBase
4
- end # StoreBase
5
- end # Store
6
- end # Apollo