apollo-crawler 0.0.36 → 0.0.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/apollo-crawler CHANGED
@@ -18,6 +18,7 @@ require "optparse"
18
18
  require 'active_support'
19
19
  require 'active_support/inflector'
20
20
 
21
+ require 'terminal-table'
21
22
 
22
23
  require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
23
24
 
@@ -219,16 +220,12 @@ module Crawler
219
220
  end
220
221
 
221
222
  if(@options[:list_plugins])
222
- puts "Listing plugins"
223
- puts "----------------------------------------"
224
- i = 0
225
- @plugins.sort.each do |plugin, klass|
226
- instance = klass.new
227
- # puts klass.class_eval("@@NAME")
228
- puts "(#{i}) #{plugin} - #{instance.name}"
229
- i += 1
230
- end
231
- puts "----------------------------------------"
223
+ headings = ['name', 'class']
224
+ rows = @plugins
225
+
226
+ table = Terminal::Table.new :headings => headings, :rows => rows
227
+
228
+ puts table
232
229
  return
233
230
  end
234
231
 
@@ -250,7 +247,10 @@ module Crawler
250
247
  next
251
248
  end
252
249
 
253
- # puts "Running '#{plugin}'"
250
+ if(@options[:verbose])
251
+ puts "Running '#{plugin}'"
252
+ end
253
+
254
254
  res = p.new.etl
255
255
  if(res.nil?)
256
256
  next
@@ -15,38 +15,31 @@ module Apollo
15
15
  return nil
16
16
  end
17
17
 
18
- def etl()
19
- #return run()
20
- res = []
21
-
22
- if(self.url.nil?)
18
+ def etl(url=nil)
19
+ # Use the passed URL, falling back to the default one; return nil if neither is set
20
+ url = url ? url : self.url
21
+ if(url.nil?)
23
22
  return nil
24
23
  end
25
24
 
26
- doc = self.fetch_document(self.url)
25
+ # Try to fetch the document
26
+ doc = self.fetch_document(url)
27
27
  if(doc.nil?)
28
28
  return nil
29
29
  end
30
30
 
31
- res = self.extract_data(doc)
31
+ # Try to extract data from the document
32
+ data = self.extract_data(doc)
33
+
34
+ # Try to extract links to other documents
35
+ links = self.extract_links(doc)
32
36
 
37
+ # Return ETL result
33
38
  return {
34
39
  :plugin => self.class.name,
35
40
  :title => doc.title,
36
- :data => res
37
- }
38
- end
39
-
40
- # - Fetch default URL (and transform it to document)
41
- # - Extract and Load (Store) important data
42
- # - Look for another documents
43
- # Examples:
44
- # - "next page"
45
- # - "people you may know on Linked in"
46
- # - "will attend on FB")
47
- def run
48
- return {
49
- :plugin => self.class.name
41
+ :data => data,
42
+ :links => links
50
43
  }
51
44
  end
52
45
 
@@ -1,5 +1,5 @@
1
1
  module Apollo
2
2
  module Crawler
3
- VERSION = '0.0.36'
3
+ VERSION = '0.0.37'
4
4
  end # Crawler
5
5
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.36
4
+ version: 0.0.37
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -123,6 +123,22 @@ dependencies:
123
123
  - - ! '>='
124
124
  - !ruby/object:Gem::Version
125
125
  version: '0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: terminal-table
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :runtime
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
126
142
  - !ruby/object:Gem::Dependency
127
143
  name: thor
128
144
  requirement: !ruby/object:Gem::Requirement