apollo-crawler 0.0.17 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/apollo-crawler +168 -10
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -2
- data/main.rb +0 -170
data/bin/apollo-crawler
CHANGED
@@ -1,12 +1,170 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require "rubygems"
|
4
|
+
require "bundler/setup"
|
5
|
+
|
6
|
+
require 'json'
|
7
|
+
|
8
|
+
require "thor"
|
9
|
+
|
10
|
+
require "open-uri"
|
11
|
+
require "nokogiri"
|
12
|
+
|
13
|
+
require "pp"
|
14
|
+
require "optparse"
|
15
|
+
|
16
|
+
module Crawler
|
17
|
+
class Program
|
18
|
+
# This hash will hold all of the options
|
19
|
+
# parsed from the command-line by
|
20
|
+
# OptionParser.
|
21
|
+
@options = nil
|
22
|
+
@optparser = nil
|
23
|
+
@plugins = nil
|
24
|
+
|
25
|
+
# Initializer - Constructor
|
26
|
+
def initialize
|
27
|
+
@plugins = {}
|
28
|
+
end
|
29
|
+
|
30
|
+
# Initialize command-line options
|
31
|
+
def init_options
|
32
|
+
@options = {}
|
33
|
+
@options[:verbose] = false
|
34
|
+
|
35
|
+
@optparser = OptionParser.new do | opts |
|
36
|
+
# This displays the help screen, all programs are
|
37
|
+
# assumed to have this option.
|
38
|
+
opts.on('-h', '--help', 'Display this screen') do
|
39
|
+
puts opts
|
40
|
+
exit
|
41
|
+
end
|
42
|
+
|
43
|
+
opts.on('-a', '--all', 'Run all plugins') do
|
44
|
+
@options[:run_all] = true
|
45
|
+
end
|
46
|
+
|
47
|
+
opts.on('-v', '--verbose', 'Enable verbose output') do
|
48
|
+
@options[:verbose] = true
|
49
|
+
end
|
50
|
+
|
51
|
+
opts.on('-l', '--list-plugins', 'List of plugins') do
|
52
|
+
@options[:list_plugins] = true
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# Parse the options passed to command-line
|
58
|
+
def parse_options
|
59
|
+
# Parse the command-line. Remember there are two forms
|
60
|
+
# of the parse method. The 'parse' method simply parses
|
61
|
+
# ARGV, while the 'parse!' method parses ARGV and removes
|
62
|
+
# any options found there, as well as any parameters for
|
63
|
+
# the options. What's left is the list of files to resize.
|
64
|
+
@optparser.parse!
|
65
|
+
end
|
66
|
+
|
67
|
+
# Load global options first
|
68
|
+
# Merge it with local options (if they exists)
|
69
|
+
def load_config_file()
|
70
|
+
config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
|
71
|
+
puts "Inspecting #{config} ..."
|
72
|
+
if(File.exists?(config))
|
73
|
+
if(@options[:verbose])
|
74
|
+
puts "Loading config '#{config}'"
|
75
|
+
end
|
76
|
+
|
77
|
+
# puts "Let's require '#{@options[:verbose]}'"
|
78
|
+
require config
|
79
|
+
else
|
80
|
+
if(@options[:verbose])
|
81
|
+
# TODO: Add support for initial rake task generation
|
82
|
+
# Something like this:
|
83
|
+
# rake config:init # Initializes config files with
|
84
|
+
# their defaults (if not exists already)
|
85
|
+
puts "Default config does not exist, skipping - '#{config}'"
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
# Register plugins (specific crawlers)
|
91
|
+
def register_plugins()
|
92
|
+
dir = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "plugins")
|
93
|
+
if(@options[:verbose])
|
94
|
+
puts "Registering plugins - '#{dir}'"
|
95
|
+
end
|
96
|
+
|
97
|
+
sites = File.join(dir, "**", "*.rb")
|
98
|
+
Dir.glob(sites).each do |site|
|
99
|
+
require site
|
100
|
+
end
|
101
|
+
|
102
|
+
tmp = Apollo::Crawler::Plugins.constants.select { |c|
|
103
|
+
Class === Apollo::Crawler::Plugins.const_get(c)
|
104
|
+
}
|
105
|
+
|
106
|
+
tmp.each do |x|
|
107
|
+
klass = Object.const_get('Apollo').const_get('Crawler').const_get('Plugins').const_get(x)
|
108
|
+
@plugins.merge!({ x.downcase.to_s => klass})
|
109
|
+
end
|
110
|
+
|
111
|
+
if(@options[:verbose])
|
112
|
+
@plugins.each do |plugin, klass|
|
113
|
+
name = klass.new.class.name
|
114
|
+
|
115
|
+
if name == "Apollo::Crawler::Plugins::Plugin"
|
116
|
+
next
|
117
|
+
end
|
118
|
+
|
119
|
+
puts "Registered '#{plugin}' -> '#{name}'"
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
def run
|
125
|
+
init_options()
|
126
|
+
|
127
|
+
load_config_file()
|
128
|
+
|
129
|
+
parse_options()
|
130
|
+
|
131
|
+
# Register sites which can be crawled
|
132
|
+
register_plugins()
|
133
|
+
|
134
|
+
if(@options[:list_plugins])
|
135
|
+
puts "Listing plugins"
|
136
|
+
puts "----------------------------------------"
|
137
|
+
i = 0
|
138
|
+
@plugins.sort.each do |plugin, klass|
|
139
|
+
instance = klass.new
|
140
|
+
# puts klass.class_eval("@@NAME")
|
141
|
+
puts "(#{i}) #{plugin} - #{instance.name}"
|
142
|
+
i += 1
|
143
|
+
end
|
144
|
+
puts "----------------------------------------"
|
145
|
+
return
|
146
|
+
end
|
147
|
+
|
148
|
+
plugins = ARGV
|
149
|
+
|
150
|
+
if(@options[:run_all])
|
151
|
+
plugins = @plugins.keys
|
152
|
+
end
|
153
|
+
|
154
|
+
if(plugins.empty?)
|
155
|
+
puts @optparser
|
156
|
+
end
|
157
|
+
|
158
|
+
plugins.each do |plugin|
|
159
|
+
p = @plugins[plugin.downcase].new
|
160
|
+
|
161
|
+
# puts "Running '#{plugin}'"
|
162
|
+
puts JSON.pretty_generate(p.run)
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
10
166
|
end
|
11
167
|
|
12
|
-
|
168
|
+
if __FILE__ == $0
|
169
|
+
Crawler::Program.new.run()
|
170
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.17
|
4
|
+
version: 0.0.18
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -18,7 +18,6 @@ executables:
|
|
18
18
|
extensions: []
|
19
19
|
extra_rdoc_files: []
|
20
20
|
files:
|
21
|
-
- ./main.rb
|
22
21
|
- ./lib/apollo_crawler/version.rb
|
23
22
|
- ./lib/apollo_crawler/crawler.rb
|
24
23
|
- ./lib/apollo_crawler/plugins/slashdot_org/slashdot.rb
|
data/main.rb
DELETED
@@ -1,170 +0,0 @@
|
|
1
|
-
#! /usr/bin/env ruby
|
2
|
-
|
3
|
-
require "rubygems"
|
4
|
-
require "bundler/setup"
|
5
|
-
|
6
|
-
require 'json'
|
7
|
-
|
8
|
-
require "thor"
|
9
|
-
|
10
|
-
require "open-uri"
|
11
|
-
require "nokogiri"
|
12
|
-
|
13
|
-
require "pp"
|
14
|
-
require "optparse"
|
15
|
-
|
16
|
-
module Crawler
|
17
|
-
class Program
|
18
|
-
# This hash will hold all of the options
|
19
|
-
# parsed from the command-line by
|
20
|
-
# OptionParser.
|
21
|
-
@options = nil
|
22
|
-
@optparser = nil
|
23
|
-
@plugins = nil
|
24
|
-
|
25
|
-
# Initializer - Constructor
|
26
|
-
def initialize
|
27
|
-
@plugins = {}
|
28
|
-
end
|
29
|
-
|
30
|
-
# Initialize command-line options
|
31
|
-
def init_options
|
32
|
-
@options = {}
|
33
|
-
@options[:verbose] = false
|
34
|
-
|
35
|
-
@optparser = OptionParser.new do | opts |
|
36
|
-
# This displays the help screen, all programs are
|
37
|
-
# assumed to have this option.
|
38
|
-
opts.on('-h', '--help', 'Display this screen') do
|
39
|
-
puts opts
|
40
|
-
exit
|
41
|
-
end
|
42
|
-
|
43
|
-
opts.on('-a', '--all', 'Run all plugins') do
|
44
|
-
@options[:run_all] = true
|
45
|
-
end
|
46
|
-
|
47
|
-
opts.on('-v', '--verbose', 'Enable verbose output') do
|
48
|
-
@options[:verbose] = true
|
49
|
-
end
|
50
|
-
|
51
|
-
opts.on('-l', '--list-plugins', 'List of plugins') do
|
52
|
-
@options[:list_plugins] = true
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
# Parse the options passed to command-line
|
58
|
-
def parse_options
|
59
|
-
# Parse the command-line. Remember there are two forms
|
60
|
-
# of the parse method. The 'parse' method simply parses
|
61
|
-
# ARGV, while the 'parse!' method parses ARGV and removes
|
62
|
-
# any options found there, as well as any parameters for
|
63
|
-
# the options. What's left is the list of files to resize.
|
64
|
-
@optparser.parse!
|
65
|
-
end
|
66
|
-
|
67
|
-
# Load global options first
|
68
|
-
# Merge it with local options (if they exists)
|
69
|
-
def load_config_file()
|
70
|
-
config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
|
71
|
-
puts "Inspecting #{config} ..."
|
72
|
-
if(File.exists?(config))
|
73
|
-
if(@options[:verbose])
|
74
|
-
puts "Loading config '#{config}'"
|
75
|
-
end
|
76
|
-
|
77
|
-
# puts "Let's require '#{@options[:verbose]}'"
|
78
|
-
require config
|
79
|
-
else
|
80
|
-
if(@options[:verbose])
|
81
|
-
# TODO: Add support for initial rake task generation
|
82
|
-
# Something like this:
|
83
|
-
# rake config:init # Initializes config files with
|
84
|
-
# their defaults (if not exists already)
|
85
|
-
puts "Default config does not exist, skipping - '#{config}'"
|
86
|
-
end
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
# Register plugins (specific crawlers)
|
91
|
-
def register_plugins()
|
92
|
-
dir = File.join(File.dirname(__FILE__), "lib", "apollo_crawler", "plugins")
|
93
|
-
if(@options[:verbose])
|
94
|
-
puts "Registering plugins - '#{dir}'"
|
95
|
-
end
|
96
|
-
|
97
|
-
sites = File.join(dir, "**", "*.rb")
|
98
|
-
Dir.glob(sites).each do |site|
|
99
|
-
require site
|
100
|
-
end
|
101
|
-
|
102
|
-
tmp = Apollo::Crawler::Plugins.constants.select { |c|
|
103
|
-
Class === Apollo::Crawler::Plugins.const_get(c)
|
104
|
-
}
|
105
|
-
|
106
|
-
tmp.each do |x|
|
107
|
-
klass = Object.const_get('Apollo').const_get('Crawler').const_get('Plugins').const_get(x)
|
108
|
-
@plugins.merge!({ x.downcase.to_s => klass})
|
109
|
-
end
|
110
|
-
|
111
|
-
if(@options[:verbose])
|
112
|
-
@plugins.each do |plugin, klass|
|
113
|
-
name = klass.new.class.name
|
114
|
-
|
115
|
-
if name == "Apollo::Crawler::Plugins::Plugin"
|
116
|
-
next
|
117
|
-
end
|
118
|
-
|
119
|
-
puts "Registered '#{plugin}' -> '#{name}'"
|
120
|
-
end
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
def run
|
125
|
-
init_options()
|
126
|
-
|
127
|
-
load_config_file()
|
128
|
-
|
129
|
-
parse_options()
|
130
|
-
|
131
|
-
# Register sites which can be crawled
|
132
|
-
register_plugins()
|
133
|
-
|
134
|
-
if(@options[:list_plugins])
|
135
|
-
puts "Listing plugins"
|
136
|
-
puts "----------------------------------------"
|
137
|
-
i = 0
|
138
|
-
@plugins.sort.each do |plugin, klass|
|
139
|
-
instance = klass.new
|
140
|
-
# puts klass.class_eval("@@NAME")
|
141
|
-
puts "(#{i}) #{plugin} - #{instance.name}"
|
142
|
-
i += 1
|
143
|
-
end
|
144
|
-
puts "----------------------------------------"
|
145
|
-
return
|
146
|
-
end
|
147
|
-
|
148
|
-
plugins = ARGV
|
149
|
-
|
150
|
-
if(@options[:run_all])
|
151
|
-
plugins = @plugins.keys
|
152
|
-
end
|
153
|
-
|
154
|
-
if(plugins.empty?)
|
155
|
-
puts @optparser
|
156
|
-
end
|
157
|
-
|
158
|
-
plugins.each do |plugin|
|
159
|
-
p = @plugins[plugin.downcase].new
|
160
|
-
|
161
|
-
# puts "Running '#{plugin}'"
|
162
|
-
puts JSON.pretty_generate(p.run)
|
163
|
-
end
|
164
|
-
end
|
165
|
-
end
|
166
|
-
end
|
167
|
-
|
168
|
-
if __FILE__ == $0
|
169
|
-
Crawler::Program.new.run()
|
170
|
-
end
|