apollo-crawler 0.0.35 → 0.0.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -251,7 +251,12 @@ module Crawler
251
251
  end
252
252
 
253
253
  # puts "Running '#{plugin}'"
254
- puts JSON.pretty_generate(p.new.run)
254
+ res = p.new.etl
255
+ if(res.nil?)
256
+ next
257
+ end
258
+
259
+ puts JSON.pretty_generate(res)
255
260
  end
256
261
  end
257
262
  end
@@ -11,6 +11,32 @@ module Apollo
11
11
  return "Plugin Base"
12
12
  end
13
13
 
14
+ def url
15
+ return nil
16
+ end
17
+
18
+ def etl()
19
+ #return run()
20
+ res = []
21
+
22
+ if(self.url.nil?)
23
+ return nil
24
+ end
25
+
26
+ doc = self.fetch_document(self.url)
27
+ if(doc.nil?)
28
+ return nil
29
+ end
30
+
31
+ res = self.extract_data(doc)
32
+
33
+ return {
34
+ :plugin => self.class.name,
35
+ :title => doc.title,
36
+ :data => res
37
+ }
38
+ end
39
+
14
40
  # - Fetch default URL (and transform it to document)
15
41
  # - Extract and Load (Store) important data
16
42
  # - Look for other documents
@@ -24,12 +50,29 @@ module Apollo
24
50
  }
25
51
  end
26
52
 
27
- # Extracts data from currently processed URL (called document here)
28
- def extract_doc_data
53
+ # Fetch document
54
+ def fetch_document(url)
55
+ ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
56
+
57
+ if(self.url.nil?)
58
+ return nil
59
+ end
60
+
61
+ # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
62
+ doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
63
+ return doc
64
+ end
65
+
66
+ # Extracts data from document
67
+ def extract_data(doc)
68
+ res = []
69
+ return res
29
70
  end
30
71
 
31
- # This function tries to get links of another URLs (called leaf here) to crawl
32
- def fetch_leafs
72
+ # Extract links to other documents from this document
73
+ def extract_links(doc)
74
+ res = []
75
+ return res
33
76
  end
34
77
  end
35
78
  end
@@ -5,32 +5,23 @@ module Apollo
5
5
  module Plugins
6
6
  # PARAMETRIZE: Plugin class name
7
7
  class PLUGIN_CLASS_NAME < Plugin
8
- @@URL = "PLUGIN_URL"
9
-
10
8
  @@MATCHER_ITEM = "PLUGIN_MATCHER"
11
9
 
12
10
  def name()
13
11
  return "PLUGIN_NAME"
14
12
  end
15
13
 
16
- def run()
17
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
14
+ def url()
15
+ return "PLUGIN_URL"
16
+ end
18
17
 
19
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
20
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
21
-
18
+ def extract_data(doc)
22
19
  res = doc.xpath(@@MATCHER_ITEM).map { |i|
23
20
  {
24
21
  :text => i.text,
25
- :link => URI.join(@@URL, i['href'])
22
+ :link => URI.join(self.url, i['href'])
26
23
  }
27
24
  }
28
-
29
- return {
30
- :plugin => self.class.name,
31
- :title => doc.title,
32
- :res => res
33
- }
34
25
  end
35
26
  end
36
27
  end # Plugins
@@ -7,31 +7,23 @@ module Apollo
7
7
  module Plugins
8
8
  # PARAMETRIZE: Plugin class name
9
9
  class Alexa < Plugin
10
- @@URL = "http://www.alexa.com/"
11
-
12
10
  @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
13
11
 
14
12
  def name()
15
13
  return "Alexa Rank"
16
14
  end
17
15
 
18
- def run()
19
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
16
+ def url()
17
+ return "http://www.alexa.com/"
18
+ end
20
19
 
21
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
22
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
20
+ def extract_data(doc)
23
21
  res = doc.xpath(@@MATCHER_ITEM).map { |i|
24
22
  {
25
23
  :text => i.text,
26
- :link => URI.join(@@URL, i['href'])
24
+ :link => URI.join(self.url, i['href'])
27
25
  }
28
26
  }
29
-
30
- return {
31
- :plugin => self.class.name,
32
- :title => doc.title,
33
- :res => res
34
- }
35
27
  end
36
28
  end
37
29
  end # Plugins
@@ -7,32 +7,23 @@ module Apollo
7
7
  module Plugins
8
8
  # PARAMETRIZE: Plugin class name
9
9
  class Firmy < Plugin
10
- @@URL = "http://www.firmy.cz/"
11
-
12
10
  @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
13
11
 
14
12
  def name()
15
13
  return "Firmy.cz"
16
14
  end
17
15
 
18
- def run()
19
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
20
-
21
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
22
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
16
+ def url()
17
+ return "http://www.firmy.cz/"
18
+ end
23
19
 
20
+ def extract_data(doc)
24
21
  res = doc.xpath(@@MATCHER_ITEM).map { |i|
25
22
  {
26
23
  :text => i.text,
27
- :link => URI.join(@@URL, i['href'])
24
+ :link => URI.join(self.url, i['href'])
28
25
  }
29
26
  }
30
-
31
- return {
32
- :plugin => self.class.name,
33
- :title => doc.title,
34
- :res => res
35
- }
36
27
  end
37
28
  end
38
29
  end # Plugins
@@ -7,32 +7,23 @@ module Apollo
7
7
  module Plugins
8
8
  # PARAMETRIZE: Plugin class name
9
9
  class Slashdot < Plugin
10
- @@URL = "http://slashdot.org/"
11
-
12
10
  @@MATCHER_ITEM = "//article/header/h2/span/a"
13
11
 
14
12
  def name
15
13
  return "Slashdot"
16
14
  end
17
15
 
18
- def run()
19
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
20
-
21
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
22
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
16
+ def url()
17
+ return"http://slashdot.org/"
18
+ end
23
19
 
20
+ def extract_data(doc)
24
21
  res = doc.xpath(@@MATCHER_ITEM).map { |i|
25
22
  {
26
23
  :text => i.text,
27
- :link => URI.join(@@URL, i['href'])
24
+ :link => URI.join(self.url, i['href'])
28
25
  }
29
26
  }
30
-
31
- return {
32
- :plugin => self.class.name,
33
- :title => doc.title,
34
- :res => res
35
- }
36
27
  end
37
28
  end
38
29
  end # Plugins
@@ -7,32 +7,23 @@ module Apollo
7
7
  module Plugins
8
8
  # PARAMETRIZE: Plugin class name
9
9
  class Xkcd < Plugin
10
- @@URL = "http://xkcd.com/"
11
-
12
10
  @@MATCHER_ITEM = "//div[@id = 'comic']/img"
13
11
 
14
12
  def name()
15
13
  return "Xkcd"
16
14
  end
17
15
 
18
- def run()
19
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
16
+ def url()
17
+ return "http://xkcd.com/"
18
+ end
20
19
 
21
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
22
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
23
-
20
+ def extract_data(doc)
24
21
  res = doc.xpath(@@MATCHER_ITEM).map { |node|
25
22
  {
26
23
  :text => node['title'],
27
- :link => URI.join(@@URL, node['src'])
24
+ :link => URI.join(self.url, node['src'])
28
25
  }
29
26
  }
30
-
31
- return {
32
- :plugin => self.class.name,
33
- :title => doc.title,
34
- :res => res
35
- }
36
27
  end
37
28
  end
38
29
  end # Plugins
@@ -7,32 +7,23 @@ module Apollo
7
7
  module Plugins
8
8
  # PARAMETRIZE: Plugin class name
9
9
  class HackerNews < Plugin
10
- @@URL = "http://news.ycombinator.com/"
11
-
12
10
  @@MATCHER_ITEM = "//td[@class = 'title']/a"
13
11
 
14
12
  def name
15
13
  return "Hacker News"
16
14
  end
17
15
 
18
- def run()
19
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
20
-
21
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
22
- doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
16
+ def url()
17
+ return "http://news.ycombinator.com/"
18
+ end
23
19
 
20
+ def extract_data(doc)
24
21
  res = doc.xpath(@@MATCHER_ITEM).map { |i|
25
22
  {
26
23
  :text => i.text,
27
- :link => URI.join(@@URL, i['href'])
24
+ :link => URI.join(self.url, i['href'])
28
25
  }
29
26
  }
30
-
31
- return {
32
- :plugin => self.class.name,
33
- :title => doc.title,
34
- :res => res
35
- }
36
27
  end
37
28
  end
38
29
  end # Plugins
@@ -1,5 +1,5 @@
1
1
  module Apollo
2
2
  module Crawler
3
- VERSION = '0.0.35'
3
+ VERSION = '0.0.36'
4
4
  end # Crawler
5
5
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.35
4
+ version: 0.0.36
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: