apollo-crawler 0.0.45 → 0.0.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/apollo-crawler CHANGED
@@ -22,8 +22,8 @@ require 'terminal-table'
22
22
 
23
23
  require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
24
24
 
25
- module Crawler
26
- class Program
25
+ module Apollo
26
+ class CrawlerProgram
27
27
  @@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
28
28
  @@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
29
29
  @@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
@@ -141,12 +141,12 @@ module Crawler
141
141
  require file
142
142
  end
143
143
 
144
- tmp = Apollo::Crawler::Formatters.constants.select { |c|
145
- Class === Apollo::Crawler::Formatters.const_get(c)
144
+ tmp = Apollo::Formatters.constants.select { |c|
145
+ Class === Apollo::Formatters.const_get(c)
146
146
  }
147
147
 
148
148
  tmp.each do |x|
149
- klass = Object.const_get('Apollo').const_get('Crawler').const_get('Formatters').const_get(x)
149
+ klass = Object.const_get('Apollo').const_get('Formatters').const_get(x)
150
150
  @formatters.merge!({ x.downcase.to_s => klass})
151
151
  end
152
152
 
@@ -154,7 +154,7 @@ module Crawler
154
154
  @formatters.each do |formatter, klass|
155
155
  name = klass.new.class.name
156
156
 
157
- if name == "Apollo::Crawler::Formatters::Formatter"
157
+ if name == "Apollo::Formatters::Formatter"
158
158
  next
159
159
  end
160
160
 
@@ -174,12 +174,12 @@ module Crawler
174
174
  require file
175
175
  end
176
176
 
177
- tmp = Apollo::Crawler::Crawlers.constants.select { |c|
178
- Class === Apollo::Crawler::Crawlers.const_get(c)
177
+ tmp = Apollo::Crawlers.constants.select { |c|
178
+ Class === Apollo::Crawlers.const_get(c)
179
179
  }
180
180
 
181
181
  tmp.each do |x|
182
- klass = Object.const_get('Apollo').const_get('Crawler').const_get('Crawlers').const_get(x)
182
+ klass = Object.const_get('Apollo').const_get('Crawlers').const_get(x)
183
183
  @crawlers.merge!({ x.downcase.to_s => klass})
184
184
  end
185
185
 
@@ -187,7 +187,7 @@ module Crawler
187
187
  @crawlers.each do |crawler, klass|
188
188
  name = klass.new.class.name
189
189
 
190
- if name == "Apollo::Crawler::Crawlers::Crawler"
190
+ if name == "Apollo::Crawlers::Crawler"
191
191
  next
192
192
  end
193
193
 
@@ -347,7 +347,7 @@ module Crawler
347
347
  end
348
348
 
349
349
  if __FILE__ == $0
350
- Crawler::Program.new.run()
350
+ Apollo::CrawlerProgram.new.run()
351
351
  else
352
- Crawler::Program.new.run()
352
+ Apollo::CrawlerProgram.new.run()
353
353
  end
@@ -2,75 +2,73 @@ require "open-uri"
2
2
  require "nokogiri"
3
3
 
4
4
  module Apollo
5
- module Crawler
6
- module Crawlers
7
- class Crawler
5
+ module Crawlers
6
+ class Crawler
8
7
 
9
- # Name of the crawler
10
- def name
11
- return "Crawler Base"
8
+ # Name of the crawler
9
+ def name
10
+ return "Crawler Base"
11
+ end
12
+
13
+ def url
14
+ return nil
15
+ end
16
+
17
+ # - (0) Figure out URL
18
+ # - (1) Extract Data
19
+ # - (2) Extract Links
20
+ # - (3) Go to (0) eventually
21
+ def etl(url=nil)
22
+ # Look for passed URL use default instead and fail if it is not valid
23
+ url = url ? url : self.url
24
+ if(url.nil?)
25
+ return nil
12
26
  end
13
27
 
14
- def url
28
+ # Try fetch document
29
+ doc = self.fetch_document(url)
30
+ if(doc.nil?)
15
31
  return nil
16
32
  end
17
33
 
18
- # - (0) Figure out URL
19
- # - (1) Extract Data
20
- # - (2) Extract Links
21
- # - (3) Go to (0) eventually
22
- def etl(url=nil)
23
- # Look for passed URL use default instead and fail if it is not valid
24
- url = url ? url : self.url
25
- if(url.nil?)
26
- return nil
27
- end
34
+ # Try extract data from document
35
+ data = self.extract_data(doc)
28
36
 
29
- # Try fetch document
30
- doc = self.fetch_document(url)
31
- if(doc.nil?)
32
- return nil
33
- end
37
+ # Try extract links for another documents
38
+ links = self.extract_links(doc)
34
39
 
35
- # Try extract data from document
36
- data = self.extract_data(doc)
40
+ # Return ETL result
41
+ return {
42
+ :crawler => self.class.name,
43
+ :title => doc.title,
44
+ :data => data,
45
+ :links => links
46
+ }
47
+ end
37
48
 
38
- # Try extract links for another documents
39
- links = self.extract_links(doc)
49
+ # Fetch document
50
+ def fetch_document(url)
51
+ ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
40
52
 
41
- # Return ETL result
42
- return {
43
- :crawler => self.class.name,
44
- :title => doc.title,
45
- :data => data,
46
- :links => links
47
- }
53
+ if(self.url.nil?)
54
+ return nil
48
55
  end
49
56
 
50
- # Fetch document
51
- def fetch_document(url)
52
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
53
-
54
- if(self.url.nil?)
55
- return nil
56
- end
57
-
58
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
59
- doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
60
- return doc
61
- end
57
+ # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
58
+ doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
59
+ return doc
60
+ end
62
61
 
63
- # Extracts data from document
64
- def extract_data(doc)
65
- res = []
66
- return res
67
- end
62
+ # Extracts data from document
63
+ def extract_data(doc)
64
+ res = []
65
+ return res
66
+ end
68
67
 
69
- # Extract links to another documents from this document
70
- def extract_links(doc)
71
- res = []
72
- return res
73
- end
68
+ # Extract links to another documents from this document
69
+ def extract_links(doc)
70
+ res = []
71
+ return res
74
72
  end
75
73
  end
76
74
  end
@@ -1,28 +1,26 @@
1
1
  require 'iconv'
2
2
 
3
3
  module Apollo
4
- module Crawler
5
- module Crawlers
6
- class CRAWLER_CLASS_NAME < Apollo::Crawler::Crawlers::Crawler
7
- @@MATCHER_ITEM = "CRAWLER_MATCHER"
4
+ module Crawlers
5
+ class CRAWLER_CLASS_NAME < Apollo::Crawler::Crawlers::Crawler
6
+ @@MATCHER_ITEM = "CRAWLER_MATCHER"
8
7
 
9
- def name()
10
- return "CRAWLER_NAME"
11
- end
8
+ def name()
9
+ return "CRAWLER_NAME"
10
+ end
12
11
 
13
- def url()
14
- return "CRAWLER_URL"
15
- end
12
+ def url()
13
+ return "CRAWLER_URL"
14
+ end
16
15
 
17
- def extract_data(doc)
18
- res = doc.xpath(@@MATCHER_ITEM).map { |i|
19
- {
20
- :text => i.text,
21
- :link => URI.join(self.url, i['href'])
22
- }
16
+ def extract_data(doc)
17
+ res = doc.xpath(@@MATCHER_ITEM).map { |i|
18
+ {
19
+ :text => i.text,
20
+ :link => URI.join(self.url, i['href'])
23
21
  }
24
- end
22
+ }
25
23
  end
26
- end # Crawlers
27
- end # Crawler
24
+ end
25
+ end # Crawlers
28
26
  end # Apollo
@@ -3,28 +3,26 @@ require 'iconv'
3
3
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
4
 
5
5
  module Apollo
6
- module Crawler
7
- module Crawlers
8
- class Alexa < Apollo::Crawler::Crawlers::Crawler
9
- @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
6
+ module Crawlers
7
+ class Alexa < Apollo::Crawlers::Crawler
8
+ @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
10
9
 
11
- def name()
12
- return "Alexa Rank"
13
- end
10
+ def name()
11
+ return "Alexa Rank"
12
+ end
14
13
 
15
- def url()
16
- return "http://www.alexa.com/"
17
- end
14
+ def url()
15
+ return "http://www.alexa.com/"
16
+ end
18
17
 
19
- def extract_data(doc)
20
- res = doc.xpath(@@MATCHER_ITEM).map { |i|
21
- {
22
- :text => i.text,
23
- :link => URI.join(self.url, i['href'])
24
- }
18
+ def extract_data(doc)
19
+ res = doc.xpath(@@MATCHER_ITEM).map { |i|
20
+ {
21
+ :text => i.text,
22
+ :link => URI.join(self.url, i['href'])
25
23
  }
26
- end
24
+ }
27
25
  end
28
- end # Crawlers
29
- end # Crawler
26
+ end
27
+ end # Crawlers
30
28
  end # Apollo
@@ -3,28 +3,26 @@ require 'iconv'
3
3
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
4
 
5
5
  module Apollo
6
- module Crawler
7
- module Crawlers
8
- class Firmy < Apollo::Crawler::Crawlers::Crawler
9
- @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
6
+ module Crawlers
7
+ class Firmy < Apollo::Crawlers::Crawler
8
+ @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
10
9
 
11
- def name()
12
- return "Firmy.cz"
13
- end
10
+ def name()
11
+ return "Firmy.cz"
12
+ end
14
13
 
15
- def url()
16
- return "http://www.firmy.cz/"
17
- end
14
+ def url()
15
+ return "http://www.firmy.cz/"
16
+ end
18
17
 
19
- def extract_data(doc)
20
- res = doc.xpath(@@MATCHER_ITEM).map { |i|
21
- {
22
- :text => i.text,
23
- :link => URI.join(self.url, i['href'])
24
- }
18
+ def extract_data(doc)
19
+ res = doc.xpath(@@MATCHER_ITEM).map { |i|
20
+ {
21
+ :text => i.text,
22
+ :link => URI.join(self.url, i['href'])
25
23
  }
26
- end
24
+ }
27
25
  end
28
- end # Crawlers
29
- end # Crawler
26
+ end
27
+ end # Crawlers
30
28
  end # Apollo
@@ -3,28 +3,26 @@ require 'iconv'
3
3
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
4
 
5
5
  module Apollo
6
- module Crawler
7
- module Crawlers
8
- class Slashdot < Apollo::Crawler::Crawlers::Crawler
9
- @@MATCHER_ITEM = "//article/header/h2/span/a"
6
+ module Crawlers
7
+ class Slashdot < Apollo::Crawlers::Crawler
8
+ @@MATCHER_ITEM = "//article/header/h2/span/a"
10
9
 
11
- def name
12
- return "Slashdot"
13
- end
10
+ def name
11
+ return "Slashdot"
12
+ end
14
13
 
15
- def url()
16
- return"http://slashdot.org/"
17
- end
14
+ def url()
15
+ return"http://slashdot.org/"
16
+ end
18
17
 
19
- def extract_data(doc)
20
- res = doc.xpath(@@MATCHER_ITEM).map { |i|
21
- {
22
- :text => i.text,
23
- :link => URI.join(self.url, i['href'])
24
- }
18
+ def extract_data(doc)
19
+ res = doc.xpath(@@MATCHER_ITEM).map { |i|
20
+ {
21
+ :text => i.text,
22
+ :link => URI.join(self.url, i['href'])
25
23
  }
26
- end
24
+ }
27
25
  end
28
- end # Crawlers
29
- end # Crawler
26
+ end
27
+ end # Crawlers
30
28
  end # Apollo
@@ -4,27 +4,25 @@ require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
4
 
5
5
  module Apollo
6
6
  module Crawlers
7
- module Crawler
8
- class StackOverflow < Apollo::Crawler::Crawlers::Crawler
9
- @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
7
+ class StackOverflow < Apollo::Crawlers::Crawler
8
+ @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
10
9
 
11
- def name
12
- return "StackOverflow"
13
- end
10
+ def name
11
+ return "StackOverflow"
12
+ end
14
13
 
15
- def url()
16
- return "http://stackoverflow.com/"
17
- end
14
+ def url()
15
+ return "http://stackoverflow.com/"
16
+ end
18
17
 
19
- def extract_data(doc)
20
- res = doc.xpath(@@MATCHER_ITEM).map { |i|
21
- {
22
- :text => i.text,
23
- :link => URI.join(self.url, i['href'])
24
- }
18
+ def extract_data(doc)
19
+ res = doc.xpath(@@MATCHER_ITEM).map { |i|
20
+ {
21
+ :text => i.text,
22
+ :link => URI.join(self.url, i['href'])
25
23
  }
26
- end
24
+ }
27
25
  end
28
- end # Crawlers
29
- end # Crawler
26
+ end
27
+ end # Crawlers
30
28
  end # Apollo
@@ -3,28 +3,26 @@ require 'iconv'
3
3
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
4
 
5
5
  module Apollo
6
- module Crawler
7
- module Crawlers
8
- class Xkcd < Apollo::Crawler::Crawlers::Crawler
9
- @@MATCHER_ITEM = "//div[@id = 'comic']/img"
6
+ module Crawlers
7
+ class Xkcd < Apollo::Crawlers::Crawler
8
+ @@MATCHER_ITEM = "//div[@id = 'comic']/img"
10
9
 
11
- def name()
12
- return "Xkcd"
13
- end
10
+ def name()
11
+ return "Xkcd"
12
+ end
14
13
 
15
- def url()
16
- return "http://xkcd.com/"
17
- end
14
+ def url()
15
+ return "http://xkcd.com/"
16
+ end
18
17
 
19
- def extract_data(doc)
20
- res = doc.xpath(@@MATCHER_ITEM).map { |node|
21
- {
22
- :text => node['title'],
23
- :link => URI.join(self.url, node['src'])
24
- }
18
+ def extract_data(doc)
19
+ res = doc.xpath(@@MATCHER_ITEM).map { |node|
20
+ {
21
+ :text => node['title'],
22
+ :link => URI.join(self.url, node['src'])
25
23
  }
26
- end
24
+ }
27
25
  end
28
- end # Crawlers
29
- end # Crawler
26
+ end
27
+ end # Crawlers
30
28
  end # Apollo
@@ -3,28 +3,26 @@ require 'iconv'
3
3
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
4
 
5
5
  module Apollo
6
- module Crawler
7
- module Crawlers
8
- class HackerNews < Apollo::Crawler::Crawlers::Crawler
9
- @@MATCHER_ITEM = "//td[@class = 'title']/a"
6
+ module Crawlers
7
+ class HackerNews < Apollo::Crawlers::Crawler
8
+ @@MATCHER_ITEM = "//td[@class = 'title']/a"
10
9
 
11
- def name
12
- return "Hacker News"
13
- end
10
+ def name
11
+ return "Hacker News"
12
+ end
14
13
 
15
- def url()
16
- return "http://news.ycombinator.com/"
17
- end
14
+ def url()
15
+ return "http://news.ycombinator.com/"
16
+ end
18
17
 
19
- def extract_data(doc)
20
- res = doc.xpath(@@MATCHER_ITEM).map { |i|
21
- {
22
- :text => i.text,
23
- :link => URI.join(self.url, i['href'])
24
- }
18
+ def extract_data(doc)
19
+ res = doc.xpath(@@MATCHER_ITEM).map { |i|
20
+ {
21
+ :text => i.text,
22
+ :link => URI.join(self.url, i['href'])
25
23
  }
26
- end
24
+ }
27
25
  end
28
- end # Crawlers
29
- end # Crawler
26
+ end
27
+ end # Crawlers
30
28
  end # Apollo
@@ -1,8 +1,6 @@
1
1
  module Apollo
2
- module Crawler
3
- module Formatters
4
- class Formatter
5
- end
6
- end # Formatters
7
- end # Crawler
2
+ module Formatters
3
+ class Formatter
4
+ end
5
+ end # Formatters
8
6
  end # Apollo
@@ -3,17 +3,15 @@ require 'json'
3
3
  require File.join(File.dirname(__FILE__), '..', 'formatter')
4
4
 
5
5
  module Apollo
6
- module Crawler
7
- module Formatters
8
- class Json < Formatter
9
- def format(obj)
10
- return Json.format(obj)
11
- end
6
+ module Formatters
7
+ class Json < Formatter
8
+ def format(obj)
9
+ return Json.format(obj)
10
+ end
12
11
 
13
- def self.format(obj)
14
- return JSON.pretty_generate(obj)
15
- end
12
+ def self.format(obj)
13
+ return JSON.pretty_generate(obj)
16
14
  end
17
- end # Formatters
18
- end # Crawler
15
+ end
16
+ end # Formatters
19
17
  end # Apollo
@@ -3,17 +3,15 @@ require 'awesome_print'
3
3
  require File.join(File.dirname(__FILE__), '..', 'formatter')
4
4
 
5
5
  module Apollo
6
- module Crawler
7
- module Formatters
8
- class Plain < Formatter
9
- def format(obj)
10
- return Plain.format(obj)
11
- end
6
+ module Formatters
7
+ class Plain < Formatter
8
+ def format(obj)
9
+ return Plain.format(obj)
10
+ end
12
11
 
13
- def self.format(obj)
14
- return obj.inspect
15
- end
12
+ def self.format(obj)
13
+ return obj.inspect
16
14
  end
17
- end # Formatters
18
- end # Crawler
15
+ end
16
+ end # Formatters
19
17
  end # Apollo
@@ -3,34 +3,32 @@ require 'terminal-table'
3
3
  require File.join(File.dirname(__FILE__), '..', 'formatter')
4
4
 
5
5
  module Apollo
6
- module Crawler
7
- module Formatters
8
- class Table < Formatter
9
- def format(obj)
10
- return Table.format(obj)
6
+ module Formatters
7
+ class Table < Formatter
8
+ def format(obj)
9
+ return Table.format(obj)
10
+ end
11
+
12
+ def self.format(obj)
13
+ headings = []
14
+ if(obj[:data].length > 0)
15
+ headings = obj[:data][0].keys
11
16
  end
12
17
 
13
- def self.format(obj)
14
- headings = []
15
- if(obj[:data].length > 0)
16
- headings = obj[:data][0].keys
18
+ rows = []
19
+ obj[:data].each do |line|
20
+ data = []
21
+ headings.each do |column|
22
+ data << line[column]
17
23
  end
18
24
 
19
- rows = []
20
- obj[:data].each do |line|
21
- data = []
22
- headings.each do |column|
23
- data << line[column]
24
- end
25
-
26
- rows << data
27
- end
25
+ rows << data
26
+ end
28
27
 
29
28
 
30
- table = Terminal::Table.new :headings => headings, :rows => rows
31
- return table
32
- end
29
+ table = Terminal::Table.new :headings => headings, :rows => rows
30
+ return table
33
31
  end
34
- end # Formatters
35
- end # Crawler
32
+ end
33
+ end # Formatters
36
34
  end # Apollo
@@ -1,5 +1,5 @@
1
1
  module Apollo
2
2
  module Crawler
3
- VERSION = '0.0.45'
3
+ VERSION = '0.0.46'
4
4
  end # Crawler
5
5
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.45
4
+ version: 0.0.46
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: