apollo-crawler 0.1.8 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. checksums.yaml +8 -8
  2. data/lib/apollo_crawler.rb +20 -13
  3. data/lib/apollo_crawler/cache/{cache_base.rb → base_cache.rb} +4 -4
  4. data/lib/apollo_crawler/cache/factory.rb +1 -1
  5. data/lib/apollo_crawler/cache/memcached_cache.rb +5 -5
  6. data/lib/apollo_crawler/cache/memory_cache.rb +5 -5
  7. data/lib/apollo_crawler/cache/null_cache.rb +3 -3
  8. data/lib/apollo_crawler/config.rb +25 -10
  9. data/lib/apollo_crawler/crawler/{crawler_base.rb → base_crawler.rb} +41 -22
  10. data/lib/apollo_crawler/crawler/{google_com/google.rb → google_crawler.rb} +7 -7
  11. data/lib/apollo_crawler/crawler/{ycombinator_com/hacker_news.rb → hacker_news_crawler.rb} +7 -7
  12. data/lib/apollo_crawler/crawler/{slashdot_org/slashdot.rb → slashdot_crawler.rb} +7 -7
  13. data/lib/apollo_crawler/crawler/{stackoverflow_com/stackoverflow.rb → stackoverflow_crawler.rb} +7 -7
  14. data/lib/apollo_crawler/crawler/{xkcd_com/xkcd.rb → xkcd_crawler.rb} +5 -5
  15. data/lib/apollo_crawler/fetcher/base_fetcher.rb +11 -0
  16. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +12 -5
  17. data/lib/apollo_crawler/fetcher/smart_fetcher.rb +15 -0
  18. data/lib/apollo_crawler/formatter/base_formatter.rb +9 -0
  19. data/lib/apollo_crawler/formatter/{formatter_json.rb → json_formatter.rb} +5 -5
  20. data/lib/apollo_crawler/formatter/plain_formatter.rb +17 -0
  21. data/lib/apollo_crawler/formatter/{formatter_table.rb → table_formatter.rb} +5 -5
  22. data/lib/apollo_crawler/lib.rb +20 -13
  23. data/lib/apollo_crawler/logger/base_logger.rb +6 -0
  24. data/lib/apollo_crawler/logger/console_logger.rb +15 -0
  25. data/lib/apollo_crawler/program.rb +15 -8
  26. data/lib/apollo_crawler/store/base_store.rb +6 -0
  27. data/lib/apollo_crawler/version.rb +1 -1
  28. metadata +17 -16
  29. data/lib/apollo_crawler/cache/filesystem_cache.rb +0 -37
  30. data/lib/apollo_crawler/crawler/crawler_template.rb +0 -24
  31. data/lib/apollo_crawler/fetcher/fetcher_base.rb +0 -6
  32. data/lib/apollo_crawler/formatter/formatter_base.rb +0 -6
  33. data/lib/apollo_crawler/formatter/formatter_plain.rb +0 -17
  34. data/lib/apollo_crawler/store/store_base.rb +0 -6
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YjEzODFmZTRlMzYyYzc1MjhkMzc4MzQ5ZDk4MWNmYWQ4NjhkNWZjNw==
4
+ Y2I3MDM1OWQ1NmU2ZDMzMTg0OGRhNTYzODc5Mzg4MDhhZTkxOWJlMQ==
5
5
  data.tar.gz: !binary |-
6
- YTc2ZTFhYmU3NjE1OGJjMTU1OTllN2JkY2ZjODBmOTdjNDgwMWVhMQ==
6
+ ZWY0N2M3ZjU5ZGNmZjgwMTdkNWI0Y2JhYmZjZmUwNDFjYTA5ZjAwOA==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- OGI1YzJjZWU0MGY0MGVjN2MxYzczMGY4ODc4OWVhMTc0MTNlNTgwYjkwNzI4
10
- ZDIxYTBlMDJlMWEwMjMzYzBmOWUyYmQ5OWMwNGEwMWNlOGI5M2U1YjIxNjVi
11
- NDg2ZGM1NDBlNzI2ZTVlOTBjOGQ5NjljYTcwNmQ2NWZhY2E1OWI=
9
+ MjM1ZWUzMGUyYzBiYjdmNmFjMmE4OTU5MjA0NDE3YWJlZTE4OWQ0YTE5MWQx
10
+ ZDBmN2UzZjk3N2Y3NDYwOGNiMWFiY2JkY2I2ODJmYzFkYWU3YTYzYjI5YTI0
11
+ NDQxZDk4YzBlODBmNDg3MzRkMDU2OWY1ZmViNmUzYWFhZjZlNGU=
12
12
  data.tar.gz: !binary |-
13
- Mjk0MWJkNDc0ZmI1MWM0ZmRkZTA4Nzg5MmQzNjNjNGZiNTE2NTgwMzU4YzI5
14
- Nzc5YWUzZWVlODUwNjJjMTc3MzBlNWUxNDQ4YWEzYzZjYmM1MDdkYWQ0ZGIz
15
- NTRmMjQwMzEwZGM0NGJhODUxZjY3ZGQwODFhY2Y4M2I3ZDEzYTk=
13
+ MDIzY2RhMmFkNmI4MjBlYzkzNmZhOTA5NjM2NWUyM2YzODQwM2M0NGIzN2U1
14
+ OTgxNGYwYTdhMTIxZGVkYzNlYjE1M2U1N2NjMDdiN2I4MWNkMTMwMzRmMDMy
15
+ MWFiOWRhNGIyOTZhY2NmZDE2YTRjMzUxYjQyODU1NTAzNjQwM2M=
@@ -5,27 +5,34 @@
5
5
  require File.join(File.dirname(__FILE__), 'apollo_crawler/program')
6
6
 
7
7
  # Caches
8
- require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/cache_base')
8
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/base_cache')
9
9
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/factory')
10
- require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/filesystem_cache')
11
10
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memcached_cache')
12
11
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memory_cache')
13
12
  require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/null_cache')
14
13
 
15
14
  # Crawlers
16
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/crawler_base')
17
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/google_com/google')
18
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_org/slashdot')
19
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_com/stackoverflow')
20
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_com/xkcd')
21
- require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/ycombinator_com/hacker_news')
15
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/base_crawler')
16
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/google_crawler')
17
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/hacker_news_crawler')
18
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_crawler')
19
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_crawler')
20
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_crawler')
22
21
 
23
22
  # Fetchers
24
- require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/fetcher_base')
23
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/base_fetcher')
25
24
  require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/simple_fetcher')
25
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/smart_fetcher')
26
26
 
27
27
  # Formatters
28
- require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_base')
29
- require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_json')
30
- require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_plain')
31
- require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_table')
28
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/base_formatter')
29
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/json_formatter')
30
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/plain_formatter')
31
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/table_formatter')
32
+
33
+ # Loggers
34
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/base_logger')
35
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/console_logger')
36
+
37
+ # Stores
38
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/store/base_store')
@@ -1,6 +1,6 @@
1
1
  module Apollo
2
2
  module Cache
3
- class CacheBase
3
+ class BaseCache
4
4
  # Get value associated with key from cache
5
5
  def get(key, *args)
6
6
 
@@ -32,6 +32,6 @@ module Apollo
32
32
  def clear
33
33
  return
34
34
  end
35
- end # CacheBase
36
- end # Cache
37
- end # Apollo
35
+ end # class BaseCache
36
+ end # module Cache
37
+ end # module Apollo
@@ -2,7 +2,7 @@
2
2
  require File.join(File.dirname(__FILE__), '..', 'config')
3
3
 
4
4
  # Cache instance base class
5
- require File.join(File.dirname(__FILE__), 'cache_base')
5
+ require File.join(File.dirname(__FILE__), 'base_cache')
6
6
 
7
7
  # Factory uses singleton pattern
8
8
  require 'singleton'
@@ -1,10 +1,10 @@
1
- require File.join(File.dirname(__FILE__), 'cache_base')
1
+ require File.join(File.dirname(__FILE__), 'base_cache')
2
2
 
3
3
  require 'dalli'
4
4
 
5
5
  module Apollo
6
6
  module Cache
7
- class Memcached < CacheBase
7
+ class MemcachedCache < BaseCache
8
8
  @cache = nil
9
9
 
10
10
  def initialize
@@ -46,6 +46,6 @@ module Apollo
46
46
  def clear
47
47
  # TODO: Implement
48
48
  end
49
- end # Null
50
- end # Cache
51
- end # Apollo
49
+ end # class MemcachedCache
50
+ end # module Cache
51
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), 'cache_base')
1
+ require File.join(File.dirname(__FILE__), 'base_cache')
2
2
 
3
3
  module Apollo
4
4
  module Cache
5
- class Memory < CacheBase
5
+ class MemoryCache < BaseCache
6
6
  @cache = nil
7
7
 
8
8
  def initialize
@@ -41,6 +41,6 @@ module Apollo
41
41
  def clear
42
42
  @cache.clear
43
43
  end
44
- end # Null
45
- end # Cache
46
- end # Apollo
44
+ end # class MemoryCache
45
+ end # module Cache
46
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), 'cache_base')
1
+ require File.join(File.dirname(__FILE__), 'base_cache')
2
2
 
3
3
  module Apollo
4
4
  module Cache
5
- class Null < CacheBase
5
+ class NullCache < BaseCache
6
6
  # Get value associated with key from cache
7
7
  def get(key, *args)
8
8
  # Not found, Create, cache and return
@@ -28,6 +28,6 @@ module Apollo
28
28
  def invalidate(key)
29
29
  return true
30
30
  end
31
- end # Null
31
+ end # NullCache
32
32
  end # Cache
33
33
  end # Apollo
@@ -14,40 +14,55 @@ module RbConfig
14
14
  ############################################################
15
15
  #
16
16
  # Filesystem backend
17
- # CACHE_CLASS = Apollo::Cache::Filesystem
17
+ # CACHE_CLASS = Apollo::Cache::FilesystemCache
18
18
  #
19
19
  # Memcached - expects localhost:11211
20
- # CACHE_CLASS = Apollo::Cache::Memcached
20
+ # CACHE_CLASS = Apollo::Cache::MemcachedCache
21
21
  #
22
22
  # Pure naive ruby in-memory implementation
23
- # CACHE_CLASS = Apollo::Cache::Memory
23
+ # CACHE_CLASS = Apollo::Cache::MemoryCache
24
24
  #
25
25
  # Null caching - no caching at all
26
- # CACHE_CLASS = Apollo::Cache::Null
26
+ # CACHE_CLASS = Apollo::Cache::NullCache
27
27
 
28
28
  # Used caching mechanism by default
29
- CACHE_CLASS = Apollo::Cache::Memcached
29
+ CACHE_CLASS = Apollo::Cache::MemcachedCache
30
30
 
31
31
 
32
32
 
33
33
  ############################################################
34
34
  # Crawlers - Built-in out-of box working crawlers
35
35
  ############################################################
36
- CRAWLERS_DIR = File.join(File.dirname(__FILE__), "crawlers")
36
+ CRAWLERS_DIR = File.join(File.dirname(__FILE__), "crawler")
37
37
 
38
38
  # Template used for generated crawlers
39
- CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
39
+ CRAWLER_TEMPLATE_NAME = "crawler_template.trb"
40
40
 
41
41
  # Path of template
42
- CRAWLER_TEMPLATE_PATH = File.join(File.dirname(__FILE__), "crawler_template.rb")
42
+ CRAWLER_TEMPLATE_PATH = File.join(CRAWLERS_DIR, CRAWLER_TEMPLATE_NAME)
43
43
 
44
44
 
45
45
 
46
+ ############################################################
47
+ # Fetchers - used for fetching documents
48
+ ############################################################
49
+ FETCHERS_DIR = File.join(File.dirname(__FILE__), "fetcher")
50
+
51
+ DEFAULT_FETCHER = Apollo::Fetcher::SmartFetcher
52
+
53
+
46
54
  ############################################################
47
55
  # Formatters - used for formatting crawled documents results
48
56
  ############################################################
49
- FORMATTERS_DIR = File.join(File.dirname(__FILE__), "formatters")
57
+ FORMATTERS_DIR = File.join(File.dirname(__FILE__), "formatter")
50
58
 
51
59
  # Default formatter if no other specified
52
- DEFAULT_FORMATTER = Apollo::Formatter::Json
60
+ DEFAULT_FORMATTER = Apollo::Formatter::JsonFormatter
61
+
62
+
63
+
64
+ ############################################################
65
+ # Loggers - used for formatting output messages
66
+ ############################################################
67
+ DEFAULT_LOGGER = Apollo::Logger::ConsoleLogger
53
68
  end # Config
@@ -3,13 +3,19 @@ require "nokogiri"
3
3
 
4
4
  module Apollo
5
5
  module Crawler
6
- class CrawlerBase
6
+ class BaseCrawler
7
+
8
+
7
9
  @backlog = nil
8
10
 
9
11
  def initialize
10
12
  @backlog = []
11
13
  end
12
14
 
15
+ def self.name_re()
16
+ return /crawler$/
17
+ end
18
+
13
19
  # Name of the crawler
14
20
  def name
15
21
  return "Crawler Base"
@@ -19,6 +25,10 @@ module Apollo
19
25
  return nil
20
26
  end
21
27
 
28
+ def self.fetch(url)
29
+ RbConfig::DEFAULT_FETCHER.fetch(url)
30
+ end
31
+
22
32
  def self.try_get_url(root, url)
23
33
  begin
24
34
  return URI.join(root, url)
@@ -28,7 +38,7 @@ module Apollo
28
38
  end
29
39
 
30
40
  def self.try_get_doc(root, url)
31
- doc = CrawlerBase.try_get_url(root, url)
41
+ doc = BaseCrawler.try_get_url(root, url)
32
42
 
33
43
  # TODO: Set expiration header
34
44
  return {
@@ -47,16 +57,19 @@ module Apollo
47
57
  url = self.url
48
58
  end
49
59
 
60
+ # TODO: Be more aggressive, use assert, it is the client's responsibility!
50
61
  if(url.nil?)
51
62
  return nil
52
63
  end
53
64
 
65
+ # We support both - list of urls or single url
54
66
  if(url.kind_of?(Array))
55
67
  @backlog.concat(url)
56
68
  else
57
69
  @backlog << url
58
70
  end
59
71
 
72
+ # Counter of processed documents (pages)
60
73
  docs_processed = 0
61
74
 
62
75
  res = []
@@ -66,34 +79,40 @@ module Apollo
66
79
 
67
80
  # puts "Processing '#{url}'"
68
81
  doc = self.process_url(url)
69
- res << doc
70
82
 
71
- # TODO: Use log4r and log it only on info level
72
- # TODO: Add some async/callback signal for document processed
73
- if block_given?
74
- yield res
75
- end
83
+ # Increase counter of processed documents
84
+ docs_processed = docs_processed + 1
85
+
86
+ # Process document if it was successfully retrieved
87
+ if(!doc.nil?)
88
+ # TODO: Use log4r and log it only on info level
89
+ if block_given?
90
+ yield doc
91
+ end
76
92
 
77
- if(!doc.nil? && !doc.empty?)
78
- doc[:links].each do |link|
79
- url = link[:link].to_s
80
- # TODO: Use log4r and log it only on info level
81
- #puts url
93
+ # Add document to queue of results
94
+ res << doc
82
95
 
83
- # TODO: Check if it is unique
84
- @backlog << url
96
+ # If the document lists links, append them to the backlog
97
+ if(doc[:links].nil? == false)
98
+ doc[:links].each do |link|
99
+ url = link[:link].to_s
100
+ # TODO: Use log4r and log it only on info level
101
+ #puts url
102
+
103
+ # TODO: Check if it is unique
104
+ @backlog << url
105
+ end
85
106
  end
86
107
  end
87
108
 
88
- # Increase counter of processed documents
89
- docs_processed = docs_processed + 1
109
+ # Break if the limit of documents to process was reached
90
110
  break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
91
111
  end
92
112
  return res
93
113
  end
94
114
 
95
115
  def process_url(url)
96
- # Try fetch document
97
116
  doc = self.fetch_document(url)
98
117
  if(doc.nil?)
99
118
  return nil
@@ -140,7 +159,7 @@ module Apollo
140
159
  res = nil
141
160
  while(attempt_no < max_attempts && success == false) do
142
161
  begin
143
- res = open(url).read
162
+ res = BaseCrawler.fetch(url)
144
163
  success = true
145
164
  rescue Exception => e
146
165
  puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
@@ -170,6 +189,6 @@ module Apollo
170
189
  res = []
171
190
  return res
172
191
  end
173
- end # CrawlerBase
174
- end # Crawler
175
- end # Apollo
192
+ end # class BaseCrawler
193
+ end # module Crawler
194
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), '..', 'crawler_base')
1
+ require File.join(File.dirname(__FILE__), 'base_crawler')
2
2
 
3
3
  module Apollo
4
4
  module Crawler
5
- class Google < CrawlerBase
5
+ class GoogleCrawler < BaseCrawler
6
6
  @@MATCHER_ITEM = "//h3/a"
7
7
 
8
8
  def name()
@@ -15,7 +15,7 @@ module Apollo
15
15
 
16
16
  def extract_data(doc)
17
17
  res = doc.xpath(@@MATCHER_ITEM).map { | node |
18
- url = CrawlerBase.try_get_url(self.url, node['href'])
18
+ url = BaseCrawler.try_get_url(self.url, node['href'])
19
19
  next if url.nil?
20
20
 
21
21
  {
@@ -27,7 +27,7 @@ module Apollo
27
27
 
28
28
  def extract_links(doc)
29
29
  res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
30
- res_doc = CrawlerBase.try_get_url(self.url, node['href'])
30
+ res_doc = BaseCrawler.try_get_url(self.url, node['href'])
31
31
  next if url.nil?
32
32
 
33
33
  {
@@ -35,6 +35,6 @@ module Apollo
35
35
  }
36
36
  }
37
37
  end
38
- end
39
- end # Crawler
40
- end # Apollo
38
+ end # class GoogleCrawler
39
+ end # module Crawler
40
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), '..', 'crawler_base')
1
+ require File.join(File.dirname(__FILE__), 'base_crawler')
2
2
 
3
3
  module Apollo
4
4
  module Crawler
5
- class HackerNews < CrawlerBase
5
+ class HackerNewsCrawler < BaseCrawler
6
6
  @@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
7
7
 
8
8
  def name()
@@ -15,7 +15,7 @@ module Apollo
15
15
 
16
16
  def extract_data(doc)
17
17
  res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
- url = CrawlerBase.try_get_url(self.url, node['href'])
18
+ url = BaseCrawler.try_get_url(self.url, node['href'])
19
19
  next if url.nil?
20
20
 
21
21
  {
@@ -29,7 +29,7 @@ module Apollo
29
29
 
30
30
  def extract_links(doc)
31
31
  res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
32
- url = CrawlerBase.try_get_url(self.url, node['href'])
32
+ url = BaseCrawler.try_get_url(self.url, node['href'])
33
33
  next if url.nil?
34
34
 
35
35
  {
@@ -39,6 +39,6 @@ module Apollo
39
39
 
40
40
  return res.uniq
41
41
  end
42
- end
43
- end # Crawler
44
- end # Apollo
42
+ end # class HackerNewsCrawler
43
+ end # module Crawler
44
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), '..', 'crawler_base')
1
+ require File.join(File.dirname(__FILE__), 'base_crawler')
2
2
 
3
3
  module Apollo
4
4
  module Crawler
5
- class Slashdot < CrawlerBase
5
+ class SlashdotCrawler < BaseCrawler
6
6
  @@MATCHER_ITEM = "//article/header/h2/span/a"
7
7
 
8
8
  def name()
@@ -15,7 +15,7 @@ module Apollo
15
15
 
16
16
  def extract_data(doc)
17
17
  res = doc.xpath(@@MATCHER_ITEM).map { | node |
18
- url = CrawlerBase.try_get_url(self.url, node['href'])
18
+ url = BaseCrawler.try_get_url(self.url, node['href'])
19
19
  next if url.nil?
20
20
 
21
21
  {
@@ -27,7 +27,7 @@ module Apollo
27
27
 
28
28
  def extract_links(doc)
29
29
  res = doc.xpath(@@MATCHER_ITEM).map { | node |
30
- url = CrawlerBase.try_get_url(self.url, node['href'])
30
+ url = BaseCrawler.try_get_url(self.url, node['href'])
31
31
  next if url.nil?
32
32
 
33
33
  {
@@ -35,6 +35,6 @@ module Apollo
35
35
  }
36
36
  }
37
37
  end
38
- end
39
- end # Crawler
40
- end # Apollo
38
+ end # class SlashdotCrawler
39
+ end # module Crawler
40
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), '..', 'crawler_base')
1
+ require File.join(File.dirname(__FILE__), 'base_crawler')
2
2
 
3
3
  module Apollo
4
4
  module Crawler
5
- class StackOverflow < CrawlerBase
5
+ class StackoverflowCrawler < BaseCrawler
6
6
  @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
7
7
 
8
8
  def name()
@@ -15,7 +15,7 @@ module Apollo
15
15
 
16
16
  def extract_data(doc)
17
17
  res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
- url = CrawlerBase.try_get_url(self.url, node['href'])
18
+ url = BaseCrawler.try_get_url(self.url, node['href'])
19
19
  next if url.nil?
20
20
 
21
21
  {
@@ -29,7 +29,7 @@ module Apollo
29
29
 
30
30
  def extract_links(doc)
31
31
  res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
32
- url = CrawlerBase.try_get_url(self.url, node['href'])
32
+ url = BaseCrawler.try_get_url(self.url, node['href'])
33
33
  next if url.nil?
34
34
 
35
35
  {
@@ -39,6 +39,6 @@ module Apollo
39
39
 
40
40
  return res.uniq
41
41
  end
42
- end
43
- end # Crawler
44
- end # Apollo
42
+ end # class StackoverflowCrawler
43
+ end # module Crawler
44
+ end # module Apollo
@@ -1,8 +1,8 @@
1
- require File.join(File.dirname(__FILE__), '..', 'crawler_base')
1
+ require File.join(File.dirname(__FILE__), 'base_crawler')
2
2
 
3
3
  module Apollo
4
4
  module Crawler
5
- class Xkcd < CrawlerBase
5
+ class XkcdCrawler < BaseCrawler
6
6
  @@MATCHER_ITEM = "//div[@id = 'comic']/img"
7
7
 
8
8
  def name()
@@ -30,6 +30,6 @@ module Apollo
30
30
  }
31
31
  res.uniq
32
32
  end
33
- end
34
- end # Crawler
35
- end # Apollo
33
+ end # class XkcdCrawler
34
+ end # module Crawler
35
+ end # module Apollo
@@ -0,0 +1,11 @@
1
+ module Apollo
2
+ module Fetcher
3
+ class BaseFetcher
4
+ def self.fetch(url)
5
+ # TODO: Throw exception ???
6
+ return nil
7
+ end
8
+
9
+ end # class BaseFetcher
10
+ end # module Fetcher
11
+ end # module Apollo
@@ -1,8 +1,15 @@
1
- require File.join(File.dirname(__FILE__), 'fetcher_base')
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
+ require File.join(File.dirname(__FILE__), 'base_fetcher')
2
5
 
3
6
  module Apollo
4
7
  module Fetcher
5
- class SimpleFetcher < FetcherBase
6
- end # Crawler
7
- end # Fetcher
8
- end # Apollo
8
+ class SimpleFetcher < BaseFetcher
9
+ def self.fetch(url)
10
+ # TODO: Throw exception ???
11
+ return open(url).read
12
+ end
13
+ end # class SimpleFetcher
14
+ end # module Fetcher
15
+ end # module Apollo
@@ -0,0 +1,15 @@
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
+ require File.join(File.dirname(__FILE__), 'base_fetcher')
5
+
6
+ module Apollo
7
+ module Fetcher
8
+ class SmartFetcher < BaseFetcher
9
+ def self.fetch(url)
10
+ # TODO: Throw exception ???
11
+ return open(url).read
12
+ end
13
+ end # class SmartFetcher
14
+ end # module Fetcher
15
+ end # module Apollo
@@ -0,0 +1,9 @@
1
+ module Apollo
2
+ module Formatter
3
+ class BaseFormatter
4
+ def self.name_re()
5
+ return /formatter$/
6
+ end
7
+ end # class BaseFormatter
8
+ end # module Formatter
9
+ end # module Apollo
@@ -1,10 +1,10 @@
1
1
  require 'json'
2
2
 
3
- require File.join(File.dirname(__FILE__), 'formatter_base')
3
+ require File.join(File.dirname(__FILE__), 'base_formatter')
4
4
 
5
5
  module Apollo
6
6
  module Formatter
7
- class Json < FormatterBase
7
+ class JsonFormatter < BaseFormatter
8
8
  def format(obj)
9
9
  return Json.format(obj)
10
10
  end
@@ -12,6 +12,6 @@ module Apollo
12
12
  def self.format(obj)
13
13
  return JSON.pretty_generate(obj)
14
14
  end
15
- end
16
- end # Formatter
17
- end # Apollo
15
+ end # class JsonFormatter
16
+ end # module Formatter
17
+ end # module Apollo
@@ -0,0 +1,17 @@
1
+ require 'awesome_print'
2
+
3
+ require File.join(File.dirname(__FILE__), 'base_formatter')
4
+
5
+ module Apollo
6
+ module Formatter
7
+ class PlainFormatter < BaseFormatter
8
+ def format(obj)
9
+ return Plain.format(obj)
10
+ end
11
+
12
+ def self.format(obj)
13
+ return obj.inspect
14
+ end
15
+ end # class PlainFormatter
16
+ end # module Formatter
17
+ end # module Apollo
@@ -1,10 +1,10 @@
1
1
  require 'terminal-table'
2
2
 
3
- require File.join(File.dirname(__FILE__), 'formatter_base')
3
+ require File.join(File.dirname(__FILE__), 'base_formatter')
4
4
 
5
5
  module Apollo
6
6
  module Formatter
7
- class Table < FormatterBase
7
+ class TableFormatter < BaseFormatter
8
8
  def format(obj)
9
9
  return Table.format(obj)
10
10
  end
@@ -30,6 +30,6 @@ module Apollo
30
30
  table = Terminal::Table.new :headings => headings, :rows => rows
31
31
  return table
32
32
  end
33
- end
34
- end # Formatter
35
- end # Apollo
33
+ end # class TableFormatter
34
+ end # module Formatter
35
+ end # module Apollo
@@ -2,27 +2,34 @@
2
2
  require File.join(File.dirname(__FILE__), 'program')
3
3
 
4
4
  # Caches
5
- require File.join(File.dirname(__FILE__), 'cache/cache_base')
5
+ require File.join(File.dirname(__FILE__), 'cache/base_cache')
6
6
  require File.join(File.dirname(__FILE__), 'cache/factory')
7
- require File.join(File.dirname(__FILE__), 'cache/filesystem_cache')
8
7
  require File.join(File.dirname(__FILE__), 'cache/memcached_cache')
9
8
  require File.join(File.dirname(__FILE__), 'cache/memory_cache')
10
9
  require File.join(File.dirname(__FILE__), 'cache/null_cache')
11
10
 
12
11
  # Crawlers
13
- require File.join(File.dirname(__FILE__), 'crawler/crawler_base')
14
- require File.join(File.dirname(__FILE__), 'crawler/google_com/google')
15
- require File.join(File.dirname(__FILE__), 'crawler/slashdot_org/slashdot')
16
- require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_com/stackoverflow')
17
- require File.join(File.dirname(__FILE__), 'crawler/xkcd_com/xkcd')
18
- require File.join(File.dirname(__FILE__), 'crawler/ycombinator_com/hacker_news')
12
+ require File.join(File.dirname(__FILE__), 'crawler/base_crawler')
13
+ require File.join(File.dirname(__FILE__), 'crawler/google_crawler')
14
+ require File.join(File.dirname(__FILE__), 'crawler/hacker_news_crawler')
15
+ require File.join(File.dirname(__FILE__), 'crawler/slashdot_crawler')
16
+ require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_crawler')
17
+ require File.join(File.dirname(__FILE__), 'crawler/xkcd_crawler')
19
18
 
20
19
  # Fetchers
21
- require File.join(File.dirname(__FILE__), 'fetcher/fetcher_base')
20
+ require File.join(File.dirname(__FILE__), 'fetcher/base_fetcher')
22
21
  require File.join(File.dirname(__FILE__), 'fetcher/simple_fetcher')
22
+ require File.join(File.dirname(__FILE__), 'fetcher/smart_fetcher')
23
23
 
24
24
  # Formatters
25
- require File.join(File.dirname(__FILE__), 'formatter/formatter_base')
26
- require File.join(File.dirname(__FILE__), 'formatter/formatter_json')
27
- require File.join(File.dirname(__FILE__), 'formatter/formatter_plain')
28
- require File.join(File.dirname(__FILE__), 'formatter/formatter_table')
25
+ require File.join(File.dirname(__FILE__), 'formatter/base_formatter')
26
+ require File.join(File.dirname(__FILE__), 'formatter/json_formatter')
27
+ require File.join(File.dirname(__FILE__), 'formatter/plain_formatter')
28
+ require File.join(File.dirname(__FILE__), 'formatter/table_formatter')
29
+
30
+ # Loggers
31
+ require File.join(File.dirname(__FILE__), 'logger/base_logger')
32
+ require File.join(File.dirname(__FILE__), 'logger/console_logger')
33
+
34
+ # Stores
35
+ require File.join(File.dirname(__FILE__), 'store/base_store')
@@ -0,0 +1,6 @@
1
+ module Apollo
2
+ module Logger
3
+ class BaseLogger
4
+ end # class BaseLogger
5
+ end # module Logger
6
+ end # module Apollo
@@ -0,0 +1,15 @@
1
+ require File.join(File.dirname(__FILE__), 'base_logger')
2
+
3
+ module Apollo
4
+ module Logger
5
+ class ConsoleLogger < BaseLogger
6
+ def log(msg)
7
+ puts msg
8
+ end
9
+
10
+ def self.log(msg)
11
+ return Logger.log(msg)
12
+ end
13
+ end # class ConsoleLogger
14
+ end # module Logger
15
+ end # module Apollo
@@ -125,7 +125,7 @@ module Apollo
125
125
  config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
126
126
  if(File.exists?(config))
127
127
  if(@options[:verbose])
128
- puts "Loading config '#{config}'"
128
+ RbConfig::DEFAULT_LOGGER "Loading config '#{config}'"
129
129
  end
130
130
 
131
131
  # puts "Let's require '#{@options[:verbose]}'"
@@ -192,7 +192,8 @@ module Apollo
192
192
 
193
193
  tmp.each do |x|
194
194
  klass = Object.const_get('Apollo').const_get('Crawler').const_get(x)
195
- @crawlers.merge!({ x.downcase.to_s => klass})
195
+ name = x.to_s.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re,"")
196
+ @crawlers.merge!({ name => klass})
196
197
  end
197
198
 
198
199
  if(@options[:verbose])
@@ -225,7 +226,8 @@ module Apollo
225
226
 
226
227
  tmp.each do |x|
227
228
  klass = Object.const_get('Apollo').const_get('Formatter').const_get(x)
228
- @formatters.merge!({ x.downcase.to_s => klass})
229
+ name = x.to_s.downcase.gsub(Apollo::Formatter::BaseFormatter.name_re,"")
230
+ @formatters.merge!({ name => klass})
229
231
  end
230
232
 
231
233
  if(@options[:verbose])
@@ -266,9 +268,10 @@ module Apollo
266
268
  end
267
269
 
268
270
  template_path = RbConfig::CRAWLER_TEMPLATE_PATH
271
+ puts template_path
269
272
  if(File.exists?(template_path) == false)
270
273
  puts "Template file '#{template_path}' does not exists!"
271
- return
274
+ return -1
272
275
  end
273
276
 
274
277
  if(@options[:verbose])
@@ -301,6 +304,8 @@ module Apollo
301
304
  end
302
305
  end
303
306
  end
307
+
308
+ return 0
304
309
  end
305
310
 
306
311
  def self.console_table(headings, rows)
@@ -342,8 +347,7 @@ module Apollo
342
347
  url = args.length > 0 ? args[0] : nil
343
348
  matcher = args.length > 1 ? args[1] : nil
344
349
 
345
- self.generate_crawler(name, url, matcher)
346
- return 0
350
+ return self.generate_crawler(name, url, matcher)
347
351
  end
348
352
 
349
353
  register_modules()
@@ -356,7 +360,8 @@ module Apollo
356
360
 
357
361
  # Look for specified formatter
358
362
  f = @formatters.select { |k, v|
359
- k.downcase == formatter_name.downcase
363
+ name = formatter_name.gsub(Apollo::Formatter::BaseFormatter::name_re, "")
364
+ k.downcase == name
360
365
  }
361
366
 
362
367
  if(f)
@@ -388,7 +393,9 @@ module Apollo
388
393
  end
389
394
 
390
395
  crawlers.each do |crawler|
391
- p = @crawlers[crawler.downcase]
396
+ crawler_name = crawler.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
397
+
398
+ p = @crawlers[crawler_name]
392
399
  if(p == nil)
393
400
  puts "Invalid crawler name - '#{crawler}'"
394
401
  puts "See program help"
@@ -0,0 +1,6 @@
1
+ module Apollo
2
+ module Store
3
+ class BaseStore
4
+ end # class BaseStore
5
+ end # module Store
6
+ end # module Apollo
@@ -1,3 +1,3 @@
1
1
  module Apollo
2
- VERSION = '0.1.8'
2
+ VERSION = '0.1.9'
3
3
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak
@@ -227,30 +227,31 @@ executables:
227
227
  extensions: []
228
228
  extra_rdoc_files: []
229
229
  files:
230
+ - ./lib/apollo_crawler/fetcher/smart_fetcher.rb
230
231
  - ./lib/apollo_crawler/fetcher/simple_fetcher.rb
231
- - ./lib/apollo_crawler/fetcher/fetcher_base.rb
232
+ - ./lib/apollo_crawler/fetcher/base_fetcher.rb
232
233
  - ./lib/apollo_crawler/lib.rb
233
234
  - ./lib/apollo_crawler/version.rb
235
+ - ./lib/apollo_crawler/logger/console_logger.rb
236
+ - ./lib/apollo_crawler/logger/base_logger.rb
234
237
  - ./lib/apollo_crawler/program.rb
235
238
  - ./lib/apollo_crawler/config.rb
236
239
  - ./lib/apollo_crawler/cache/factory.rb
237
240
  - ./lib/apollo_crawler/cache/null_cache.rb
238
- - ./lib/apollo_crawler/cache/cache_base.rb
239
241
  - ./lib/apollo_crawler/cache/memory_cache.rb
240
- - ./lib/apollo_crawler/cache/filesystem_cache.rb
242
+ - ./lib/apollo_crawler/cache/base_cache.rb
241
243
  - ./lib/apollo_crawler/cache/memcached_cache.rb
242
- - ./lib/apollo_crawler/crawler/crawler_template.rb
243
- - ./lib/apollo_crawler/crawler/stackoverflow_com/stackoverflow.rb
244
- - ./lib/apollo_crawler/crawler/xkcd_com/xkcd.rb
245
- - ./lib/apollo_crawler/crawler/google_com/google.rb
246
- - ./lib/apollo_crawler/crawler/crawler_base.rb
247
- - ./lib/apollo_crawler/crawler/slashdot_org/slashdot.rb
248
- - ./lib/apollo_crawler/crawler/ycombinator_com/hacker_news.rb
249
- - ./lib/apollo_crawler/formatter/formatter_base.rb
250
- - ./lib/apollo_crawler/formatter/formatter_plain.rb
251
- - ./lib/apollo_crawler/formatter/formatter_json.rb
252
- - ./lib/apollo_crawler/formatter/formatter_table.rb
253
- - ./lib/apollo_crawler/store/store_base.rb
244
+ - ./lib/apollo_crawler/crawler/xkcd_crawler.rb
245
+ - ./lib/apollo_crawler/crawler/google_crawler.rb
246
+ - ./lib/apollo_crawler/crawler/slashdot_crawler.rb
247
+ - ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
248
+ - ./lib/apollo_crawler/crawler/base_crawler.rb
249
+ - ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
250
+ - ./lib/apollo_crawler/formatter/table_formatter.rb
251
+ - ./lib/apollo_crawler/formatter/base_formatter.rb
252
+ - ./lib/apollo_crawler/formatter/json_formatter.rb
253
+ - ./lib/apollo_crawler/formatter/plain_formatter.rb
254
+ - ./lib/apollo_crawler/store/base_store.rb
254
255
  - ./lib/apollo_crawler.rb
255
256
  - bin/apollo-crawler
256
257
  homepage: https://github.com/korczis/apollo-crawler
@@ -1,37 +0,0 @@
1
- require File.join(File.dirname(__FILE__), 'cache_base')
2
-
3
- module Apollo
4
- module Cache
5
- class Filesystem < CacheBase
6
- def initialize
7
- # puts "This if Filesystem cache"
8
- end
9
-
10
- # Get value associated with key from cache
11
- def get(key, *args)
12
- # Not found, Create, cache and return
13
- if block_given?
14
- res = yield args
15
- end
16
-
17
- return res
18
- end
19
-
20
- # Set value associated with key
21
- # Return cached value
22
- def set(key, value)
23
- return value
24
- end
25
-
26
- # Check if cache contains specified key
27
- def contains(key)
28
- return false
29
- end
30
-
31
- # Invalidate key/value pair
32
- def invalidate(key)
33
- return true
34
- end
35
- end # Filesystem
36
- end # Cache
37
- end # Apollo
@@ -1,24 +0,0 @@
1
- module Apollo
2
- module Crawler
3
- class CRAWLER_CLASS_NAME < Crawler
4
- @@MATCHER_ITEM = "CRAWLER_MATCHER"
5
-
6
- def name()
7
- return "CRAWLER_NAME"
8
- end
9
-
10
- def url()
11
- return "CRAWLER_URL"
12
- end
13
-
14
- def extract_data(doc)
15
- res = doc.xpath(@@MATCHER_ITEM).map { |i|
16
- {
17
- :text => i.text,
18
- :link => URI.join(self.url, i['href'])
19
- }
20
- }
21
- end
22
- end # CRAWLER_CLASS_NAME
23
- end # Crawler
24
- end # Apollo
@@ -1,6 +0,0 @@
1
- module Apollo
2
- module Fetcher
3
- class FetcherBase
4
- end # FetcherBase
5
- end # Fetcher
6
- end # Apollo
@@ -1,6 +0,0 @@
1
- module Apollo
2
- module Formatter
3
- class FormatterBase
4
- end # FormatterBase
5
- end # Formatter
6
- end # Apollo
@@ -1,17 +0,0 @@
1
- require 'awesome_print'
2
-
3
- require File.join(File.dirname(__FILE__), 'formatter_base')
4
-
5
- module Apollo
6
- module Formatter
7
- class Plain < FormatterBase
8
- def format(obj)
9
- return Plain.format(obj)
10
- end
11
-
12
- def self.format(obj)
13
- return obj.inspect
14
- end
15
- end
16
- end # Formatter
17
- end # Apollo
@@ -1,6 +0,0 @@
1
- module Apollo
2
- module Store
3
- class StoreBase
4
- end # StoreBase
5
- end # Store
6
- end # Apollo