apollo-crawler 0.0.45 → 0.0.46

data/bin/apollo-crawler CHANGED
@@ -22,8 +22,8 @@ require 'terminal-table'
 
 require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
 
-module Crawler
-  class Program
+module Apollo
+  class CrawlerProgram
     @@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
     @@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
     @@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
@@ -141,12 +141,12 @@ module Crawler
         require file
       end
 
-      tmp = Apollo::Crawler::Formatters.constants.select { |c|
-        Class === Apollo::Crawler::Formatters.const_get(c)
+      tmp = Apollo::Formatters.constants.select { |c|
+        Class === Apollo::Formatters.const_get(c)
       }
 
       tmp.each do |x|
-        klass = Object.const_get('Apollo').const_get('Crawler').const_get('Formatters').const_get(x)
+        klass = Object.const_get('Apollo').const_get('Formatters').const_get(x)
         @formatters.merge!({ x.downcase.to_s => klass})
       end
 
@@ -154,7 +154,7 @@ module Crawler
       @formatters.each do |formatter, klass|
         name = klass.new.class.name
 
-        if name == "Apollo::Crawler::Formatters::Formatter"
+        if name == "Apollo::Formatters::Formatter"
           next
         end
 
@@ -174,12 +174,12 @@ module Crawler
         require file
       end
 
-      tmp = Apollo::Crawler::Crawlers.constants.select { |c|
-        Class === Apollo::Crawler::Crawlers.const_get(c)
+      tmp = Apollo::Crawlers.constants.select { |c|
+        Class === Apollo::Crawlers.const_get(c)
       }
 
       tmp.each do |x|
-        klass = Object.const_get('Apollo').const_get('Crawler').const_get('Crawlers').const_get(x)
+        klass = Object.const_get('Apollo').const_get('Crawlers').const_get(x)
         @crawlers.merge!({ x.downcase.to_s => klass})
       end
 
@@ -187,7 +187,7 @@ module Crawler
       @crawlers.each do |crawler, klass|
         name = klass.new.class.name
 
-        if name == "Apollo::Crawler::Crawlers::Crawler"
+        if name == "Apollo::Crawlers::Crawler"
           next
         end
 
@@ -347,7 +347,7 @@ module Crawler
 end
 
 if __FILE__ == $0
-  Crawler::Program.new.run()
+  Apollo::CrawlerProgram.new.run()
 else
-  Crawler::Program.new.run()
+  Apollo::CrawlerProgram.new.run()
 end
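Taken together, the bin/apollo-crawler changes rename the entry-point class from Crawler::Program to Apollo::CrawlerProgram and flatten the lookup namespaces to Apollo::Crawlers and Apollo::Formatters. A minimal, self-contained sketch of the constant-based discovery pattern the loader uses (the Example formatter below is hypothetical; the gem registers whatever classes it finds after requiring the files in its formatters directory):

# Sketch of the discovery pattern: enumerate classes under the flattened
# namespace, skip the base class, and register the rest by lowercased name.
module Apollo
  module Formatters
    class Formatter; end              # base class, skipped during registration
    class Example < Formatter; end    # hypothetical stand-in for Json/Plain/Table
  end
end

formatters = {}
Apollo::Formatters.constants.select { |c| Class === Apollo::Formatters.const_get(c) }.each do |c|
  klass = Object.const_get('Apollo').const_get('Formatters').const_get(c)
  next if klass.name == "Apollo::Formatters::Formatter"
  formatters[c.downcase.to_s] = klass
end

puts formatters.inspect   # => {"example"=>Apollo::Formatters::Example}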
@@ -2,75 +2,73 @@ require "open-uri"
 require "nokogiri"
 
 module Apollo
-  module Crawler
-    module Crawlers
-      class Crawler
+  module Crawlers
+    class Crawler
 
-        # Name of the crawler
-        def name
-          return "Crawler Base"
+      # Name of the crawler
+      def name
+        return "Crawler Base"
+      end
+
+      def url
+        return nil
+      end
+
+      # - (0) Figure out URL
+      # - (1) Extract Data
+      # - (2) Extract Links
+      # - (3) Go to (0) eventually
+      def etl(url=nil)
+        # Look for passed URL use default instead and fail if it is not valid
+        url = url ? url : self.url
+        if(url.nil?)
+          return nil
         end
 
-        def url
+        # Try fetch document
+        doc = self.fetch_document(url)
+        if(doc.nil?)
           return nil
         end
 
-        # - (0) Figure out URL
-        # - (1) Extract Data
-        # - (2) Extract Links
-        # - (3) Go to (0) eventually
-        def etl(url=nil)
-          # Look for passed URL use default instead and fail if it is not valid
-          url = url ? url : self.url
-          if(url.nil?)
-            return nil
-          end
+        # Try extract data from document
+        data = self.extract_data(doc)
 
-          # Try fetch document
-          doc = self.fetch_document(url)
-          if(doc.nil?)
-            return nil
-          end
+        # Try extract links for another documents
+        links = self.extract_links(doc)
 
-          # Try extract data from document
-          data = self.extract_data(doc)
+        # Return ETL result
+        return {
+          :crawler => self.class.name,
+          :title => doc.title,
+          :data => data,
+          :links => links
+        }
+      end
 
-          # Try extract links for another documents
-          links = self.extract_links(doc)
+      # Fetch document
+      def fetch_document(url)
+        ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
 
-          # Return ETL result
-          return {
-            :crawler => self.class.name,
-            :title => doc.title,
-            :data => data,
-            :links => links
-          }
+        if(self.url.nil?)
+          return nil
         end
 
-        # Fetch document
-        def fetch_document(url)
-          ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
-
-          if(self.url.nil?)
-            return nil
-          end
-
-          # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-          doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
-          return doc
-        end
+        # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
+        doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
+        return doc
+      end
 
-        # Extracts data from document
-        def extract_data(doc)
-          res = []
-          return res
-        end
+      # Extracts data from document
+      def extract_data(doc)
+        res = []
+        return res
+      end
 
-        # Extract links to another documents from this document
-        def extract_links(doc)
-          res = []
-          return res
-        end
+      # Extract links to another documents from this document
+      def extract_links(doc)
+        res = []
+        return res
       end
     end
   end
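With the flattened namespace, a concrete crawler only overrides name, url, and the extract_* hooks and inherits the etl pipeline above. A minimal offline sketch (the StaticExample class and its canned HTML are hypothetical; it stubs fetch_document so etl can run without the network, unlike the shipped crawlers below, which let the base class fetch self.url):

require "uri"
require "nokogiri"

module Apollo
  module Crawlers
    # Hypothetical crawler: assumes the Crawler base class above is already loaded.
    class StaticExample < Crawler
      def name
        return "Static Example"
      end

      def url
        return "http://example.com/"
      end

      # Stubbed out so the example runs offline.
      def fetch_document(url)
        return Nokogiri::HTML("<html><head><title>Example</title></head><body><a href='/a'>First</a></body></html>")
      end

      def extract_links(doc)
        return doc.xpath("//a").map { |i| { :text => i.text, :link => URI.join(self.url, i['href']) } }
      end
    end
  end
end

result = Apollo::Crawlers::StaticExample.new.etl
# => { :crawler => "Apollo::Crawlers::StaticExample", :title => "Example", :data => [], :links => [{:text=>"First", :link=>#<URI::HTTP http://example.com/a>}] }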
@@ -1,28 +1,26 @@
 require 'iconv'
 
 module Apollo
-  module Crawler
-    module Crawlers
-      class CRAWLER_CLASS_NAME < Apollo::Crawler::Crawlers::Crawler
-        @@MATCHER_ITEM = "CRAWLER_MATCHER"
+  module Crawlers
+    class CRAWLER_CLASS_NAME < Apollo::Crawler::Crawlers::Crawler
+      @@MATCHER_ITEM = "CRAWLER_MATCHER"
 
-        def name()
-          return "CRAWLER_NAME"
-        end
+      def name()
+        return "CRAWLER_NAME"
+      end
 
-        def url()
-          return "CRAWLER_URL"
-        end
+      def url()
+        return "CRAWLER_URL"
+      end
 
-        def extract_data(doc)
-          res = doc.xpath(@@MATCHER_ITEM).map { |i|
-            {
-              :text => i.text,
-              :link => URI.join(self.url, i['href'])
-            }
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { |i|
+          {
+            :text => i.text,
+            :link => URI.join(self.url, i['href'])
           }
-        end
+        }
       end
-    end # Crawlers
-  end # Crawler
+    end
+  end # Crawlers
 end # Apollo
@@ -3,28 +3,26 @@ require 'iconv'
 require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
-  module Crawler
-    module Crawlers
-      class Alexa < Apollo::Crawler::Crawlers::Crawler
-        @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
+  module Crawlers
+    class Alexa < Apollo::Crawlers::Crawler
+      @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
 
-        def name()
-          return "Alexa Rank"
-        end
+      def name()
+        return "Alexa Rank"
+      end
 
-        def url()
-          return "http://www.alexa.com/"
-        end
+      def url()
+        return "http://www.alexa.com/"
+      end
 
-        def extract_data(doc)
-          res = doc.xpath(@@MATCHER_ITEM).map { |i|
-            {
-              :text => i.text,
-              :link => URI.join(self.url, i['href'])
-            }
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { |i|
+          {
+            :text => i.text,
+            :link => URI.join(self.url, i['href'])
           }
-        end
+        }
       end
-    end # Crawlers
-  end # Crawler
+    end
+  end # Crawlers
 end # Apollo
@@ -3,28 +3,26 @@ require 'iconv'
 require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
-  module Crawler
-    module Crawlers
-      class Firmy < Apollo::Crawler::Crawlers::Crawler
-        @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
+  module Crawlers
+    class Firmy < Apollo::Crawlers::Crawler
+      @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
 
-        def name()
-          return "Firmy.cz"
-        end
+      def name()
+        return "Firmy.cz"
+      end
 
-        def url()
-          return "http://www.firmy.cz/"
-        end
+      def url()
+        return "http://www.firmy.cz/"
+      end
 
-        def extract_data(doc)
-          res = doc.xpath(@@MATCHER_ITEM).map { |i|
-            {
-              :text => i.text,
-              :link => URI.join(self.url, i['href'])
-            }
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { |i|
+          {
+            :text => i.text,
+            :link => URI.join(self.url, i['href'])
           }
-        end
+        }
       end
-    end # Crawlers
-  end # Crawler
+    end
+  end # Crawlers
 end # Apollo
@@ -3,28 +3,26 @@ require 'iconv'
 require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
-  module Crawler
-    module Crawlers
-      class Slashdot < Apollo::Crawler::Crawlers::Crawler
-        @@MATCHER_ITEM = "//article/header/h2/span/a"
+  module Crawlers
+    class Slashdot < Apollo::Crawlers::Crawler
+      @@MATCHER_ITEM = "//article/header/h2/span/a"
 
-        def name
-          return "Slashdot"
-        end
+      def name
+        return "Slashdot"
+      end
 
-        def url()
-          return"http://slashdot.org/"
-        end
+      def url()
+        return"http://slashdot.org/"
+      end
 
-        def extract_data(doc)
-          res = doc.xpath(@@MATCHER_ITEM).map { |i|
-            {
-              :text => i.text,
-              :link => URI.join(self.url, i['href'])
-            }
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { |i|
+          {
+            :text => i.text,
+            :link => URI.join(self.url, i['href'])
           }
-        end
+        }
       end
-    end # Crawlers
-  end # Crawler
+    end
+  end # Crawlers
 end # Apollo
@@ -4,27 +4,25 @@ require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
   module Crawlers
-    module Crawler
-      class StackOverflow < Apollo::Crawler::Crawlers::Crawler
-        @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
+    class StackOverflow < Apollo::Crawlers::Crawler
+      @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
 
-        def name
-          return "StackOverflow"
-        end
+      def name
+        return "StackOverflow"
+      end
 
-        def url()
-          return "http://stackoverflow.com/"
-        end
+      def url()
+        return "http://stackoverflow.com/"
+      end
 
-        def extract_data(doc)
-          res = doc.xpath(@@MATCHER_ITEM).map { |i|
-            {
-              :text => i.text,
-              :link => URI.join(self.url, i['href'])
-            }
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { |i|
+          {
+            :text => i.text,
+            :link => URI.join(self.url, i['href'])
           }
-        end
+        }
       end
-    end # Crawlers
-  end # Crawler
+    end
+  end # Crawlers
 end # Apollo
@@ -3,28 +3,26 @@ require 'iconv'
 require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
-  module Crawler
-    module Crawlers
-      class Xkcd < Apollo::Crawler::Crawlers::Crawler
-        @@MATCHER_ITEM = "//div[@id = 'comic']/img"
+  module Crawlers
+    class Xkcd < Apollo::Crawlers::Crawler
+      @@MATCHER_ITEM = "//div[@id = 'comic']/img"
 
-        def name()
-          return "Xkcd"
-        end
+      def name()
+        return "Xkcd"
+      end
 
-        def url()
-          return "http://xkcd.com/"
-        end
+      def url()
+        return "http://xkcd.com/"
+      end
 
-        def extract_data(doc)
-          res = doc.xpath(@@MATCHER_ITEM).map { |node|
-            {
-              :text => node['title'],
-              :link => URI.join(self.url, node['src'])
-            }
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { |node|
+          {
+            :text => node['title'],
+            :link => URI.join(self.url, node['src'])
           }
-        end
+        }
       end
-    end # Crawlers
-  end # Crawler
+    end
+  end # Crawlers
 end # Apollo
@@ -3,28 +3,26 @@ require 'iconv'
 require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
-  module Crawler
-    module Crawlers
-      class HackerNews < Apollo::Crawler::Crawlers::Crawler
-        @@MATCHER_ITEM = "//td[@class = 'title']/a"
+  module Crawlers
+    class HackerNews < Apollo::Crawlers::Crawler
+      @@MATCHER_ITEM = "//td[@class = 'title']/a"
 
-        def name
-          return "Hacker News"
-        end
+      def name
+        return "Hacker News"
+      end
 
-        def url()
-          return "http://news.ycombinator.com/"
-        end
+      def url()
+        return "http://news.ycombinator.com/"
+      end
 
-        def extract_data(doc)
-          res = doc.xpath(@@MATCHER_ITEM).map { |i|
-            {
-              :text => i.text,
-              :link => URI.join(self.url, i['href'])
-            }
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { |i|
+          {
+            :text => i.text,
+            :link => URI.join(self.url, i['href'])
          }
-        end
+        }
       end
-    end # Crawlers
-  end # Crawler
+    end
+  end # Crawlers
 end # Apollo
@@ -1,8 +1,6 @@
 module Apollo
-  module Crawler
-    module Formatters
-      class Formatter
-      end
-    end # Formatters
-  end # Crawler
+  module Formatters
+    class Formatter
+    end
+  end # Formatters
 end # Apollo
@@ -3,17 +3,15 @@ require 'json'
 require File.join(File.dirname(__FILE__), '..', 'formatter')
 
 module Apollo
-  module Crawler
-    module Formatters
-      class Json < Formatter
-        def format(obj)
-          return Json.format(obj)
-        end
+  module Formatters
+    class Json < Formatter
+      def format(obj)
+        return Json.format(obj)
+      end
 
-        def self.format(obj)
-          return JSON.pretty_generate(obj)
-        end
+      def self.format(obj)
+        return JSON.pretty_generate(obj)
       end
-    end # Formatters
-  end # Crawler
+    end
+  end # Formatters
 end # Apollo
@@ -3,17 +3,15 @@ require 'awesome_print'
 require File.join(File.dirname(__FILE__), '..', 'formatter')
 
 module Apollo
-  module Crawler
-    module Formatters
-      class Plain < Formatter
-        def format(obj)
-          return Plain.format(obj)
-        end
+  module Formatters
+    class Plain < Formatter
+      def format(obj)
+        return Plain.format(obj)
+      end
 
-        def self.format(obj)
-          return obj.inspect
-        end
+      def self.format(obj)
+        return obj.inspect
       end
-    end # Formatters
-  end # Crawler
+    end
+  end # Formatters
 end # Apollo
@@ -3,34 +3,32 @@ require 'terminal-table'
 require File.join(File.dirname(__FILE__), '..', 'formatter')
 
 module Apollo
-  module Crawler
-    module Formatters
-      class Table < Formatter
-        def format(obj)
-          return Table.format(obj)
+  module Formatters
+    class Table < Formatter
+      def format(obj)
+        return Table.format(obj)
+      end
+
+      def self.format(obj)
+        headings = []
+        if(obj[:data].length > 0)
+          headings = obj[:data][0].keys
         end
 
-        def self.format(obj)
-          headings = []
-          if(obj[:data].length > 0)
-            headings = obj[:data][0].keys
+        rows = []
+        obj[:data].each do |line|
+          data = []
+          headings.each do |column|
+            data << line[column]
           end
 
-          rows = []
-          obj[:data].each do |line|
-            data = []
-            headings.each do |column|
-              data << line[column]
-            end
-
-            rows << data
-          end
+          rows << data
+        end
 
 
-          table = Terminal::Table.new :headings => headings, :rows => rows
-          return table
-        end
+        table = Terminal::Table.new :headings => headings, :rows => rows
+        return table
       end
-    end # Formatters
-  end # Crawler
+    end
+  end # Formatters
 end # Apollo
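Under the flattened namespace, each formatter exposes both an instance-level format(obj) and a class-level self.format(obj) that take the result hash produced by Crawler#etl. A usage sketch, assuming the gem and the formatters above are loaded (the result hash below is illustrative sample data, not real crawler output):

result = {
  :crawler => "Apollo::Crawlers::Xkcd",
  :title   => "xkcd",
  :data    => [{ :text => "Example comic", :link => "http://example.com/comic.png" }],
  :links   => []
}

puts Apollo::Formatters::Json.format(result)   # pretty-printed JSON via JSON.pretty_generate
puts Apollo::Formatters::Plain.format(result)  # Ruby inspect output
puts Apollo::Formatters::Table.format(result)  # terminal-table built from the :data rows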
@@ -1,5 +1,5 @@
 module Apollo
   module Crawler
-    VERSION = '0.0.45'
+    VERSION = '0.0.46'
   end # Crawler
 end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: apollo-crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.45
+  version: 0.0.46
 prerelease:
 platform: ruby
 authors: