apollo-crawler 0.0.36 → 0.0.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/apollo-crawler +11 -11
- data/lib/apollo_crawler/plugin.rb +14 -21
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +17 -1
data/bin/apollo-crawler
CHANGED
@@ -18,6 +18,7 @@ require "optparse"
|
|
18
18
|
require 'active_support'
|
19
19
|
require 'active_support/inflector'
|
20
20
|
|
21
|
+
require 'terminal-table'
|
21
22
|
|
22
23
|
require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
|
23
24
|
|
@@ -219,16 +220,12 @@ module Crawler
|
|
219
220
|
end
|
220
221
|
|
221
222
|
if(@options[:list_plugins])
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
puts "(#{i}) #{plugin} - #{instance.name}"
|
229
|
-
i += 1
|
230
|
-
end
|
231
|
-
puts "----------------------------------------"
|
223
|
+
headings = ['name', 'class']
|
224
|
+
rows = @plugins
|
225
|
+
|
226
|
+
table = Terminal::Table.new :headings => headings, :rows => rows
|
227
|
+
|
228
|
+
puts table
|
232
229
|
return
|
233
230
|
end
|
234
231
|
|
@@ -250,7 +247,10 @@ module Crawler
|
|
250
247
|
next
|
251
248
|
end
|
252
249
|
|
253
|
-
|
250
|
+
if(@options[:verbose])
|
251
|
+
puts "Running '#{plugin}'"
|
252
|
+
end
|
253
|
+
|
254
254
|
res = p.new.etl
|
255
255
|
if(res.nil?)
|
256
256
|
next
|
data/lib/apollo_crawler/plugin.rb
CHANGED
@@ -15,38 +15,31 @@ module Apollo
|
|
15
15
|
return nil
|
16
16
|
end
|
17
17
|
|
18
|
-
def etl()
|
19
|
-
#
|
20
|
-
|
21
|
-
|
22
|
-
if(self.url.nil?)
|
18
|
+
def etl(url=nil)
|
19
|
+
# Look for passed URL use default instead and fail if it is not valid
|
20
|
+
url = url ? url : self.url
|
21
|
+
if(url.nil?)
|
23
22
|
return nil
|
24
23
|
end
|
25
24
|
|
26
|
-
|
25
|
+
# Try fetch document
|
26
|
+
doc = self.fetch_document(url)
|
27
27
|
if(doc.nil?)
|
28
28
|
return nil
|
29
29
|
end
|
30
30
|
|
31
|
-
|
31
|
+
# Try extract data from document
|
32
|
+
data = self.extract_data(doc)
|
33
|
+
|
34
|
+
# Try extract links for another documents
|
35
|
+
links = self.extract_links(doc)
|
32
36
|
|
37
|
+
# Return ETL result
|
33
38
|
return {
|
34
39
|
:plugin => self.class.name,
|
35
40
|
:title => doc.title,
|
36
|
-
:data =>
|
37
|
-
|
38
|
-
end
|
39
|
-
|
40
|
-
# - Fetch default URL (and transform it to document)
|
41
|
-
# - Extract and Load (Store) important data
|
42
|
-
# - Look for another documents
|
43
|
-
# Examples:
|
44
|
-
# - "next page"
|
45
|
-
# - "people you may know on Linked in"
|
46
|
-
# - "will attend on FB")
|
47
|
-
def run
|
48
|
-
return {
|
49
|
-
:plugin => self.class.name
|
41
|
+
:data => data,
|
42
|
+
:links => links
|
50
43
|
}
|
51
44
|
end
|
52
45
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.36
|
4
|
+
version: 0.0.37
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -123,6 +123,22 @@ dependencies:
|
|
123
123
|
- - ! '>='
|
124
124
|
- !ruby/object:Gem::Version
|
125
125
|
version: '0'
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: terminal-table
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
type: :runtime
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
126
142
|
- !ruby/object:Gem::Dependency
|
127
143
|
name: thor
|
128
144
|
requirement: !ruby/object:Gem::Requirement
|