apollo-crawler 0.0.35 → 0.0.36

Sign up to get free protection for your applications and to get access to all the features.
@@ -251,7 +251,12 @@ module Crawler
251
251
  end
252
252
 
253
253
  # puts "Running '#{plugin}'"
254
- puts JSON.pretty_generate(p.new.run)
254
+ res = p.new.etl
255
+ if(res.nil?)
256
+ next
257
+ end
258
+
259
+ puts JSON.pretty_generate(res)
255
260
  end
256
261
  end
257
262
  end
@@ -11,6 +11,32 @@ module Apollo
11
11
  return "Plugin Base"
12
12
  end
13
13
 
14
+ def url
15
+ return nil
16
+ end
17
+
18
+ def etl()
19
+ #return run()
20
+ res = []
21
+
22
+ if(self.url.nil?)
23
+ return nil
24
+ end
25
+
26
+ doc = self.fetch_document(self.url)
27
+ if(doc.nil?)
28
+ return nil
29
+ end
30
+
31
+ res = self.extract_data(doc)
32
+
33
+ return {
34
+ :plugin => self.class.name,
35
+ :title => doc.title,
36
+ :data => res
37
+ }
38
+ end
39
+
14
40
  # - Fetch default URL (and transform it to document)
15
41
  # - Extract and Load (Store) important data
16
42
  # - Look for other documents
@@ -24,12 +50,29 @@ module Apollo
24
50
  }
25
51
  end
26
52
 
27
- # Extracts data from currently processed URL (called document here)
28
- def extract_doc_data
53
+ # Fetch document
54
+ def fetch_document(url)
55
+ ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
56
+
57
+ if(self.url.nil?)
58
+ return nil
59
+ end
60
+
61
+ # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
62
+ doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
63
+ return doc
64
+ end
65
+
66
+ # Extracts data from document
67
+ def extract_data(doc)
68
+ res = []
69
+ return res
29
70
  end
30
71
 
31
- # This function tries to get links of another URLs (called leaf here) to crawl
32
- def fetch_leafs
72
+ # Extract links to other documents from this document
73
+ def extract_links(doc)
74
+ res = []
75
+ return res
33
76
  end
34
77
  end
35
78
  end
@@ -5,32 +5,23 @@ module Apollo
5
5
  module Plugins
6
6
  # PARAMETRIZE: Plugin class name
7
7
  class PLUGIN_CLASS_NAME < Plugin
8
- @@URL = "PLUGIN_URL"
9
-
10
8
  @@MATCHER_ITEM = "PLUGIN_MATCHER"
11
9
 
12
10
  def name()
13
11
  return "PLUGIN_NAME"
14
12
  end
15
13
 
16
- def run()
17
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
14
+ def url()
15
+ return "PLUGIN_URL"
16
+ end
18
17
 
19
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
20
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
21
-
18
+ def extract_data(doc)
22
19
  res = doc.xpath(@@MATCHER_ITEM).map { |i|
23
20
  {
24
21
  :text => i.text,
25
- :link => URI.join(@@URL, i['href'])
22
+ :link => URI.join(self.url, i['href'])
26
23
  }
27
24
  }
28
-
29
- return {
30
- :plugin => self.class.name,
31
- :title => doc.title,
32
- :res => res
33
- }
34
25
  end
35
26
  end
36
27
  end # Plugins
@@ -7,31 +7,23 @@ module Apollo
7
7
  module Plugins
8
8
  # PARAMETRIZE: Plugin class name
9
9
  class Alexa < Plugin
10
- @@URL = "http://www.alexa.com/"
11
-
12
10
  @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
13
11
 
14
12
  def name()
15
13
  return "Alexa Rank"
16
14
  end
17
15
 
18
- def run()
19
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
16
+ def url()
17
+ return "http://www.alexa.com/"
18
+ end
20
19
 
21
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
22
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
20
+ def extract_data(doc)
23
21
  res = doc.xpath(@@MATCHER_ITEM).map { |i|
24
22
  {
25
23
  :text => i.text,
26
- :link => URI.join(@@URL, i['href'])
24
+ :link => URI.join(self.url, i['href'])
27
25
  }
28
26
  }
29
-
30
- return {
31
- :plugin => self.class.name,
32
- :title => doc.title,
33
- :res => res
34
- }
35
27
  end
36
28
  end
37
29
  end # Plugins
@@ -7,32 +7,23 @@ module Apollo
7
7
  module Plugins
8
8
  # PARAMETRIZE: Plugin class name
9
9
  class Firmy < Plugin
10
- @@URL = "http://www.firmy.cz/"
11
-
12
10
  @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
13
11
 
14
12
  def name()
15
13
  return "Firmy.cz"
16
14
  end
17
15
 
18
- def run()
19
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
20
-
21
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
22
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
16
+ def url()
17
+ return "http://www.firmy.cz/"
18
+ end
23
19
 
20
+ def extract_data(doc)
24
21
  res = doc.xpath(@@MATCHER_ITEM).map { |i|
25
22
  {
26
23
  :text => i.text,
27
- :link => URI.join(@@URL, i['href'])
24
+ :link => URI.join(self.url, i['href'])
28
25
  }
29
26
  }
30
-
31
- return {
32
- :plugin => self.class.name,
33
- :title => doc.title,
34
- :res => res
35
- }
36
27
  end
37
28
  end
38
29
  end # Plugins
@@ -7,32 +7,23 @@ module Apollo
7
7
  module Plugins
8
8
  # PARAMETRIZE: Plugin class name
9
9
  class Slashdot < Plugin
10
- @@URL = "http://slashdot.org/"
11
-
12
10
  @@MATCHER_ITEM = "//article/header/h2/span/a"
13
11
 
14
12
  def name
15
13
  return "Slashdot"
16
14
  end
17
15
 
18
- def run()
19
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
20
-
21
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
22
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
16
+ def url()
17
+ return"http://slashdot.org/"
18
+ end
23
19
 
20
+ def extract_data(doc)
24
21
  res = doc.xpath(@@MATCHER_ITEM).map { |i|
25
22
  {
26
23
  :text => i.text,
27
- :link => URI.join(@@URL, i['href'])
24
+ :link => URI.join(self.url, i['href'])
28
25
  }
29
26
  }
30
-
31
- return {
32
- :plugin => self.class.name,
33
- :title => doc.title,
34
- :res => res
35
- }
36
27
  end
37
28
  end
38
29
  end # Plugins
@@ -7,32 +7,23 @@ module Apollo
7
7
  module Plugins
8
8
  # PARAMETRIZE: Plugin class name
9
9
  class Xkcd < Plugin
10
- @@URL = "http://xkcd.com/"
11
-
12
10
  @@MATCHER_ITEM = "//div[@id = 'comic']/img"
13
11
 
14
12
  def name()
15
13
  return "Xkcd"
16
14
  end
17
15
 
18
- def run()
19
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
16
+ def url()
17
+ return "http://xkcd.com/"
18
+ end
20
19
 
21
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
22
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
23
-
20
+ def extract_data(doc)
24
21
  res = doc.xpath(@@MATCHER_ITEM).map { |node|
25
22
  {
26
23
  :text => node['title'],
27
- :link => URI.join(@@URL, node['src'])
24
+ :link => URI.join(self.url, node['src'])
28
25
  }
29
26
  }
30
-
31
- return {
32
- :plugin => self.class.name,
33
- :title => doc.title,
34
- :res => res
35
- }
36
27
  end
37
28
  end
38
29
  end # Plugins
@@ -7,32 +7,23 @@ module Apollo
7
7
  module Plugins
8
8
  # PARAMETRIZE: Plugin class name
9
9
  class HackerNews < Plugin
10
- @@URL = "http://news.ycombinator.com/"
11
-
12
10
  @@MATCHER_ITEM = "//td[@class = 'title']/a"
13
11
 
14
12
  def name
15
13
  return "Hacker News"
16
14
  end
17
15
 
18
- def run()
19
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
20
-
21
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
22
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
16
+ def url()
17
+ return "http://news.ycombinator.com/"
18
+ end
23
19
 
20
+ def extract_data(doc)
24
21
  res = doc.xpath(@@MATCHER_ITEM).map { |i|
25
22
  {
26
23
  :text => i.text,
27
- :link => URI.join(@@URL, i['href'])
24
+ :link => URI.join(self.url, i['href'])
28
25
  }
29
26
  }
30
-
31
- return {
32
- :plugin => self.class.name,
33
- :title => doc.title,
34
- :res => res
35
- }
36
27
  end
37
28
  end
38
29
  end # Plugins
@@ -1,5 +1,5 @@
1
1
  module Apollo
2
2
  module Crawler
3
- VERSION = '0.0.35'
3
+ VERSION = '0.0.36'
4
4
  end # Crawler
5
5
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.35
4
+ version: 0.0.36
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: