kitcrawler 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,48 @@
1
+ KITCrawler
2
+ ===================
3
+ Fetch lecture PDFs with ease.
4
+
5
+
6
+ It currently supports crawling PDFs for lectures from the studium.kit.edu page,
7
+ but can be easily extended to fetch PDFs from other services.
8
+
9
+ Requirements
10
+ -------------------
11
+ - ruby (>= 1.9, but 1.8 might also be okay)
12
+ - bundler (or install the required gems (see `Gemfile`) manually)
13
+ - linux (with curl, might also work on other Unixes)
14
+
15
+ Install
16
+ -------------------
17
+ Simply run
18
+ ```
19
+ gem install kitcrawler
20
+ ```
21
+ to install the gem (it's often a bit behind the repo).
22
+
23
+ Or run it from source.
24
+ ```
25
+ git clone https://github.com/parttimenerd/KITCrawler
26
+ cd KITCrawler
27
+ bundle install
28
+ ```
29
+
30
+
31
+ Usage
32
+ -------------------
33
+ Run
34
+ ```
35
+ kitcrawler add NAME
36
+ ```
37
+ to add a new fetch job named `NAME`. This will prompt you to pass an entry URL to the site, etc.
38
+
39
+ To finally run your jobs use
40
+ ```
41
+ kitcrawler fetch
42
+ ```
43
+
44
+ It also supports some command line parameters, run `kitcrawler` to see an explanation.
45
+
46
+ License
47
+ -------------------
48
+ The code is GNU GPL v3 licensed.
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #require 'kitcrawler'
4
+ require_relative '../lib/kitcrawler.rb'
5
+ KITCrawler::run_cli
@@ -0,0 +1,162 @@
1
+ require 'pp'
2
+ require 'json'
3
+ require 'thor'
4
+ require_relative 'services.rb'
5
+ require_relative 'cli_add.rb'
6
+
7
+ def kitgrawler_dir
8
+ `echo ~/.kitcrawler`.strip
9
+ end
10
+
11
+ class CLI < Thor
12
+ class_option :config_file, :type => :string, :aliases => "-c",
13
+ :default => "#{kitgrawler_dir}/config.json",
14
+ :desc => "Use CONFIG_FILE as config file location"
15
+ class_option :auth_file, :type => :string, :aliases => "-a",
16
+ :default => "#{kitgrawler_dir}/auth.json",
17
+ :desc => "Use AUTH_FILE as authentication location"
18
+ class_option :url_cache_file, :type => :string, :aliases => "-u",
19
+ :default => "#{kitgrawler_dir}/url_type_cache.json",
20
+ :desc => "Use URL_CACHE_FILE as url cache file location"
21
+ class_option :debug, :type => :boolean, :default => false, :aliases => "-d",
22
+ :desc => "Print everything to standard out"
23
+ class_option :verbose, :type => :boolean, :default => false, :aliases => "-v",
24
+ :desc => "Print a lot to standard out"
25
+ class_option :warn, :type => :boolean, :default => false, :aliases => "-w",
26
+ :desc => "Print only warnings and errors to standard out"
27
+ class_option :quiet, :type => :boolean, :default => false, :aliases => "-q",
28
+ :desc => "Print nothing to standard out"
29
+
30
+ desc "fetch [NAME]", "Run fetch job NAME or all jobs"
31
+ def fetch name
32
+ if name == nil
33
+ CLIHelper.new(options).fetch_all
34
+ else
35
+ CLIHelper.new(options).fetch name
36
+ end
37
+ end
38
+
39
+ desc "add NAME", "Add new job NAME"
40
+ def add name
41
+ CLI_ADD.add_config_ui name, options
42
+ end
43
+
44
+ end
45
+
46
+ class CLIHelper
47
+
48
+ def initialize options
49
+ @options = options
50
+ `mkdir -p #{kitgrawler_dir}`
51
+ init_logger
52
+ load_conf_file
53
+ load_auth_file
54
+ load_url_type_cache_file
55
+ end
56
+
57
+ def init_logger
58
+ @log = Logger.new STDOUT
59
+ @log_level = Logger::WARN
60
+ @log_level = Logger::DEBUG if @options[:debug]
61
+ @log_level = Logger::INFO if @options[:verbose]
62
+ @log_level = Logger::WARN if @options[:warn]
63
+ @log_level = Logger::UNKOWN if @options[:quiet]
64
+ @log.level = @log_level
65
+ @log.progname = "cli"
66
+ end
67
+
68
+ def load_conf_file
69
+ @conf = {}
70
+ return unless File.exists?(@options[:config_file])
71
+ begin
72
+ @conf = JSON.load File.read(@options[:config_file])
73
+ rescue => ex
74
+ @log.fatal "Cannot load config file #{@options[:config_file]}"
75
+ @log.fatal ex
76
+ exit 1
77
+ end
78
+ end
79
+
80
+ def load_auth_file
81
+ @auth_conf = {}
82
+ return unless File.exists?(@options[:auth_file])
83
+ begin
84
+ @auth_conf = JSON.load File.read(@options[:auth_file])
85
+ rescue => ex
86
+ @log.fatal "Cannot load authentication config file #{@options[:auth_file]}"
87
+ @log.fatal ex
88
+ exit 1
89
+ end
90
+ end
91
+
92
+ def load_url_type_cache_file
93
+ @url_type_cache = {}
94
+ return unless File.exists?(@options[:url_cache_file])
95
+ begin
96
+ @url_type_cache = JSON.load File.read(@options[:url_cache_file])
97
+ rescue => ex
98
+ @log.fatal "Cannot load url type cache file #{@options[:url_cache_file]}"
99
+ @log.fatal ex
100
+ exit 1
101
+ end
102
+ end
103
+
104
+ def fetch_all
105
+ begin
106
+ @conf.each_key do |grawl_job|
107
+ fetch grawl_job
108
+ end
109
+ rescue => ex
110
+ @log.fatal "Error grawling configured locations"
111
+ @log.fatal ex
112
+ exit 1
113
+ ensure
114
+ File.open(@options[:url_cache_file], "w") do |f|
115
+ f.puts JSON::pretty_generate @url_type_cache
116
+ end
117
+ end
118
+ end
119
+
120
+ def fetch job_name
121
+ grawl_location = job_name
122
+ conf = @conf[job_name]
123
+ if conf == nil
124
+ print_job_name_guess job_name
125
+ return
126
+ end
127
+ begin
128
+ service = BaseService::get_service grawl_location, conf, @auth_conf, @log_level, @url_type_cache
129
+ begin
130
+ service.execute
131
+ rescue => ex
132
+ @log.error "Failed executing #{grawl_location}"
133
+ @log.error ex
134
+ end
135
+ rescue => ex
136
+ @log.error "Failed to instantiate #{grawl_location}"
137
+ @log.error ex
138
+ end
139
+ end
140
+
141
+ def print_job_name_guess job_name
142
+ unless @options[:quiet]
143
+ puts "There is no job '#{job_name}'."
144
+ puts "Maybe you meant one of the following"
145
+ best_n_matches(@conf.keys, job_name, 3).each do |name|
146
+ puts " #{name}"
147
+ end
148
+ end
149
+ end
150
+
151
+ def best_n_matches arr, comp, n
152
+ require 'damerau-levenshtein'
153
+ map = {}
154
+ dl = DamerauLevenshtein
155
+ arr.each do |str|
156
+ map[str] = dl.distance(str, comp, 2)
157
+ end
158
+ return arr.sort {|a, b| map[a] <=> map[b] }
159
+ end
160
+ end
161
+
162
+
@@ -0,0 +1,151 @@
1
+ require 'pp'
2
+ require 'json'
3
+ require_relative 'services.rb'
4
+
5
+ module CLI_ADD
6
+
7
+ require 'highline/import'
8
+
9
+ def self.add_config_ui name, options
10
+ CLIHelper.new options
11
+ config_json = JSON.load File.read(options[:config_file])
12
+ auth_json = JSON.load File.read(options[:auth_file])
13
+ say("Configure #{name}")
14
+ name = check_name name, config_json
15
+ conf = {}
16
+ conf["entry_url"] = ask_entry_url
17
+ conf["type"] = ask_type conf
18
+ conf["pdfs"] = ask_pdfs conf
19
+ if BaseService.get_services["needs_auth"]
20
+ conf["auth"] = ask_auth name, conf, auth_json
21
+ end
22
+ config_json[name] = conf
23
+ say "This configuration is placed into your config files."
24
+ say "Your config file is #{options[:config_file]}."
25
+ say "Your authentication config file is #{options[:auth_file]}."
26
+ File.open(options[:config_file], "w") do |f|
27
+ f.puts JSON.pretty_generate config_json
28
+ end
29
+ File.open(options[:auth_file], "w") do |f|
30
+ f.puts JSON.pretty_generate auth_json
31
+ end
32
+ end
33
+
34
+ def self.check_name name, config_json
35
+ names = config_json.keys
36
+ while names.include? name
37
+ name = ask "Fetch job name (#{name} is already in use)? "
38
+ end
39
+ return name
40
+ end
41
+
42
+ def self.ask_entry_url
43
+ return ask_url "Entry point url? "
44
+ end
45
+
46
+ def self.ask_type conf
47
+ default = BaseService::get_service_for_url conf["entry_url"]
48
+ choose do |menu|
49
+ menu.prompt = "Service type [#{default}]? "
50
+ menu.default = default
51
+ BaseService::get_services.each do |name, service|
52
+ menu.choices("#{name} (#{service["description"]})") do |q|
53
+ say "You've choosen '#{name}'."
54
+ return name
55
+ end
56
+ end
57
+ end
58
+ end
59
+
60
+ def self.ask_pdfs conf
61
+ return {
62
+ "src_folder" => ask_non_empty("Source folder url (relative to entry url directory if starts with dot)? "),
63
+ "dest_folder" => ask_non_empty("Destination folder (relative to $HOME)? "),
64
+ "download_once" => ask_yes_no(
65
+ "Dowload a PDF only once (ignore changes, boost performance)? ", "yes"
66
+ ) == "yes"
67
+ }
68
+ end
69
+
70
+ def self.ask_auth name, conf, auth_json
71
+ is_studium_kit = conf["type"] == "studium_kit"
72
+ has_s_kit_auth = auth_json["studium_kit"] != nil
73
+ has_name_auth = auth_json[name] != nil
74
+ default = ""
75
+ if ask_yes_no("Auth: Use existing user/password configuration? ", is_studium_kit && has_s_kit_auth ? "yes" : "no") == "yes"
76
+ if is_studium_kit && has_s_kit_auth
77
+ default = "studium_kit"
78
+ end
79
+ choose do |menu|
80
+ menu.prompt = "Auth: Which configuration? "
81
+ menu.default = default unless default.empty?
82
+ auth_json.each do |name, config|
83
+ menu.choices("#{name} (user: #{config["user"]})") do |q|
84
+ say "Auth: You've chosen '#{name}'."
85
+ return name
86
+ end
87
+ end
88
+ end
89
+ else
90
+ auths = auth_json.keys
91
+ auth_name = ""
92
+ if is_studium_kit && !has_s_kit_auth
93
+ default = "studium_kit"
94
+ auth_name = ask "Auth: Configuration name [studium_kit]? " do |q|
95
+ q.default = studium_kit
96
+ end
97
+ elsif not has_name_auth
98
+ auth_name = ask "Auth: Configuration name [#{name}]? " do |q|
99
+ q.default = name
100
+ end
101
+ else
102
+ auth_name = ask_non_empty "Auth: Configuration name? "
103
+ end
104
+ begin
105
+ auth_json[auth_name] = ask_user_pass
106
+ end while ask_yes_no("Auth: Confirm that you're credentials are right. Are they? ", "yes") == "no"
107
+ return auth_name
108
+ end
109
+ end
110
+
111
+ def self.ask_user_pass
112
+ user = ask "Auth: User name? "
113
+ pass = ""
114
+ pass2 = ""
115
+ begin
116
+ pass = ask("Auth: Password? ") { |q| q.echo = "x" }
117
+ pass2 = ask("Auth: Retype it ") { |q| q.echo = "x" }
118
+ end while pass != pass2
119
+ return {
120
+ "user" => user,
121
+ "pass" => pass
122
+ }
123
+ end
124
+
125
+ def self.ask_url question
126
+ str = ""
127
+ while str.strip.length < 4
128
+ str = ask(question) || ""
129
+ end
130
+ return str.strip
131
+ end
132
+
133
+ def self.ask_non_empty question
134
+ str = ""
135
+ while str.strip.empty?
136
+ str = ask(question) || ""
137
+ end
138
+ return str.strip
139
+ end
140
+
141
+ def self.ask_yes_no question, default = "yes"
142
+ choose do |menu|
143
+ menu.layout = :one_line
144
+ menu.prompt = "#{question} [#{default}] "
145
+ menu.default = default
146
+ menu.choices(:yes, :no) do |q|
147
+ return q.to_s
148
+ end
149
+ end
150
+ end
151
+ end
@@ -0,0 +1,16 @@
1
+ #!/bin/ruby
2
+
3
+ require 'logger'
4
+ require 'json'
5
+ require 'optparse'
6
+
7
+ module KITCrawler
8
+
9
+ require_relative 'services.rb'
10
+ require_relative 'cli_add.rb'
11
+ require_relative 'cli.rb'
12
+
13
+ def self.run_cli
14
+ CLI.start ARGV
15
+ end
16
+ end
@@ -0,0 +1,381 @@
1
+ require 'nokogiri'
2
+ require 'cgi'
3
+ require 'pp'
4
+ require 'uri'
5
+ require 'logger'
6
+ require 'set'
7
+ require 'time'
8
+ require 'uri'
9
+ require 'json'
10
+
11
+ class BaseService
12
+
13
+ @conf = {}
14
+ @auth_app = ""
15
+ @file_header_cache = {} #url => splitted HTTP header lines
16
+ @processed_pdfs = {} #url => dest file
17
+ @name = ""
18
+ @@log
19
+ @@service_classes = {}
20
+ @type_cache = {}
21
+ @uri_cache = {} #url => URI
22
+
23
+ def initialize name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}
24
+ @base_dir = `echo ~`.strip
25
+ @uri_cache = {}
26
+ @file_header_cache = {}
27
+ @type_cache = url_type_cache
28
+ @processed_pdfs = {}
29
+ @name = name
30
+ @log = Logger.new(STDOUT)
31
+ @log.progname = name
32
+ @log.level = log_level
33
+
34
+ @conf = {
35
+ "type" => "base",
36
+ "exclude_file_endings" => [".css", ".js", ".txt", ".rss", ".atom"],
37
+ "access_pause" => { #in seconds
38
+ "min" => 0.1,
39
+ "max" => 0.3
40
+ },
41
+ "pdfs" => {
42
+ "src_folder" => "abc.de/a", #is relative to entry_url base dir if starts with dot
43
+ "dest_folder" => "abcd",
44
+ "download_once" => true
45
+ },
46
+ "cookie_jar" => "cookies.txt",
47
+ "user_agent" => "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0",
48
+ "entry_url" => "",
49
+ "auth" => "base" #references auth conf or {"user" => "", "pass" => ""}
50
+ }
51
+ temp_conf = @conf.merge conf
52
+ unless conf["pdfs"] == nil
53
+ temp_conf["pdfs"] = @conf["pdfs"].merge conf["pdfs"]
54
+ end
55
+ @conf = temp_conf
56
+ if @conf["auth"].is_a? String
57
+ @conf["auth"] = auth_conf[@conf["auth"]]
58
+ @log.debug "Load auth from auth config #{auth_conf}"
59
+ end
60
+
61
+ if @conf["pdfs"]["src_folder"].start_with? "."
62
+ entry_uri = get_uri @conf["entry_url"]
63
+ entry_path_url = entry_uri.scheme + "://" + entry_uri.host + File.dirname(entry_uri.path)
64
+ @conf["pdfs"]["src_folder"] = "#{entry_path_url}/#{@conf["pdfs"]["src_folder"]}"
65
+ @log.info "Source folder is #{@conf["pdfs"]["src_folder"]}"
66
+ end
67
+
68
+ src_url_parsed = URI.parse(@conf["pdfs"]["src_folder"])
69
+ @conf["pdfs"]["src_path"] = src_url_parsed.path
70
+ @conf["pdfs"]["src_host"] = src_url_parsed.host
71
+ @log.info "Start authentication"
72
+ authenticate
73
+ @log.info "Authentication completed"
74
+ end
75
+
76
+ def self.get_service name, conf, auth_conf={}, log_level = Logger::WARN, url_type_cache = {}
77
+ service = @@service_classes[conf["type"]]
78
+ if service == nil
79
+ raise "Unknown service #{conf["type"]}"
80
+ else
81
+ service["class"].new name, conf, auth_conf, log_level, url_type_cache
82
+ end
83
+ end
84
+
85
+ def authenticate
86
+ ""
87
+ end
88
+
89
+ def execute
90
+ @log.info "Start grawling #{@conf["entry_url"]}"
91
+ parse_html_page @conf["entry_url"]
92
+ @log.info "Completed grawling #{@conf["entry_url"]}"
93
+ end
94
+
95
+ def parse_html_page url, url_cache = Set.new
96
+ url = url_chomp url
97
+ return if url_cache.member?(url)
98
+ url_cache.add url
99
+ @log.info "Fetch and parse #{url}"
100
+ html = ""
101
+ begin
102
+ html = fetch_url url
103
+ access_pause_sleep
104
+ rescue => ex
105
+ @log.error "Cannot fetch #{url}"
106
+ @log.error ex
107
+ return
108
+ end
109
+ links = parse_html url, html
110
+ links["html"].each do |html_link|
111
+ parse_html_page html_link, url_cache
112
+ end
113
+ links["pdf"].each do |pdf_link|
114
+ process_pdf pdf_link
115
+ end
116
+ end
117
+
118
+ ##
119
+ #Executes curl to fetch the requested url
120
+ #@param url requested url
121
+ #@param output_file output destination, if nil the output gets returned by
122
+ #this method
123
+ def fetch_url url, output_file=nil, curl_params=""
124
+ curl_params = "#{@auth_app} #{curl_params} --silent --user-agent \"#{@conf["user_agent"]}\""
125
+ curl_params += " -b #{@conf["cookie_jar"]} -c #{@conf["cookie_jar"]} -L -o \"#{output_file || "-"}\" #{url}"
126
+ @log.debug "Call curl on #{url}"
127
+ @log.debug "Curl parameters '#{curl_params}'"
128
+ `cd #{@base_dir}; curl #{curl_params}`
129
+ end
130
+
131
+ def post url, params, output_file=nil, curl_params=""
132
+ param_arr = []
133
+ params.each do |key, value|
134
+ param_arr << "#{CGI::escape(key)}=#{CGI::escape(value)}"
135
+ end
136
+ param = param_arr.join "&"
137
+ begin
138
+ fetch_url url, output_file, "#{curl_params} --data \"#{param}\""
139
+ rescue => ex
140
+ @log.error "Failed to POST #{url} with data #{params}"
141
+ @log.error ex
142
+ ""
143
+ end
144
+ end
145
+
146
+ def parse_html url, html
147
+ doc = nil
148
+ links = {'pdf' => [], 'html' => []}
149
+ begin
150
+ doc = Nokogiri::HTML html
151
+ rescue => ex
152
+ @log.error "Parsing html from url #{url} failed"
153
+ return links
154
+ end
155
+ doc.css('a[href]').each do |link|
156
+ begin
157
+ link_url = url_chomp(URI.join(url, link.attributes["href"]).to_s).to_s
158
+ @log.debug "Process link #{link_url}"
159
+ if is_pdf_url link_url
160
+ links['pdf'] << link_url
161
+ @log.debug "#{link_url} is pdf"
162
+ elsif is_html_url link_url
163
+ links['html'] << link_url
164
+ @log.debug "#{link_url} is html"
165
+ end
166
+ rescue => ex
167
+ @log.debug "Omit #{link}"
168
+ end
169
+ end
170
+ return links
171
+ end
172
+
173
+ def get_field_value html, field
174
+ doc = nil
175
+ begin
176
+ doc = Nokogiri::HTML html
177
+ rescue => ex
178
+ @log.error "Parsing html failed"
179
+ @log.error ex
180
+ return ""
181
+ end
182
+ value = ""
183
+ doc.css("##{field}").each do |link|
184
+ value = link.attributes["value"].to_s
185
+ end
186
+ return value
187
+ end
188
+
189
+ def get_type url
190
+ if is_excluded url
191
+ return ""
192
+ end
193
+ if @type_cache[url] == nil
194
+ if url.upcase.end_with?(".PDF") ||
195
+ get_file_header(url)["Content-Type"].start_with?("application/pdf", "application/x-pdf")
196
+ @type_cache[url] = "pdf"
197
+ elsif get_file_header(url)["Content-Type"].start_with?("text/html")
198
+ @type_cache[url] = "html"
199
+ else
200
+ @type_cache[url] = ""
201
+ end
202
+ end
203
+ return @type_cache[url]
204
+ end
205
+
206
+ def is_pdf_url url
207
+ get_type(url) == "pdf"
208
+ end
209
+
210
+ def is_html_url url
211
+ get_type(url) == "html"
212
+ end
213
+
214
+ def is_excluded url
215
+ parsed_url = get_uri url
216
+ parsed_url.path.send(:start_with?, @conf["exclude_file_endings"]) ||
217
+ parsed_url.host != @conf["pdfs"]["src_host"] ||
218
+ !parsed_url.path.start_with?(@conf["pdfs"]["src_path"])
219
+ end
220
+
221
+ def access_pause_sleep
222
+ min = @conf["access_pause"]["min"]
223
+ max = @conf["access_pause"]["max"]
224
+ duration = Random.rand() * (max - min) + min
225
+ @log.debug "Sleep #{duration} seconds to behave a bit more human"
226
+ sleep duration
227
+ end
228
+
229
+ def get_file_header url
230
+ url = url_chomp url
231
+ if @file_header_cache[url] == nil
232
+ header = fetch_url url, "-", "-I"
233
+ lines = header.split("\r\n").map {|val| val.split(": ") }
234
+ response = {}
235
+ lines.each {|arr| response[arr[0]] = arr[1] }
236
+ @file_header_cache[url] = response
237
+ @log.info "Fetch header of #{url}"
238
+ access_pause_sleep
239
+ end
240
+ return @file_header_cache[url]
241
+ end
242
+
243
+ def get_path_url url
244
+ parsed = get_uri url
245
+ parsed.path + (parsed.query != "" ? "?#{parsed.query}": "")
246
+ end
247
+
248
+ def process_pdf url
249
+ url = url_chomp url
250
+ return unless @processed_pdfs[url] == nil
251
+ @log.info "Process pdf #{url}"
252
+ dest = get_dest_path url
253
+ if not @conf["pdfs"]["download_once"]
254
+ header_date = get_file_header(url)["Last-Modified"]
255
+ header_time = header_date != nil ? Time.parse(header_date).to_i : Time.now.to_i
256
+ file_time = File.exists?(dest) ? File.mtime(dest).to_i : 0
257
+ @log.info "Process pdf #{url} with mtime #{header_time}, file mtime #{file_time}"
258
+ if file_time >= header_time
259
+ @log.info "Destination file #{dest} isn't younger => no download"
260
+ return
261
+ end
262
+ elsif File.exists? dest
263
+ @log.info "Destination file exists => no download"
264
+ return
265
+ end
266
+ `mkdir -p "#{File.dirname(dest)}"` unless File.exists? File.dirname(dest)
267
+ @log.info "Destination file #{dest} is older => download"
268
+ begin
269
+ @log.debug(fetch_url url, dest)
270
+ rescue => ex
271
+ @log.error "Downloading #{url} failed"
272
+ @log.error ex
273
+ end
274
+ @processed_pdfs[url] = dest
275
+ access_pause_sleep
276
+ end
277
+
278
+ def get_dest_path url
279
+ url_path = get_uri(url).path
280
+ src_path = @conf["pdfs"]["src_path"]
281
+ dest_folder = @conf["pdfs"]["dest_folder"]
282
+ dest_folder + "/" + url_path.slice(src_path.length, url_path.length - src_path.length)
283
+ end
284
+
285
+ def self.add_service_class name, description, service_class, needs_auth = true, url_regex = nil
286
+ @@service_classes[name] = {
287
+ "class" => service_class,
288
+ "url_regex" => url_regex,
289
+ "description" => description,
290
+ "needs_auth" => needs_auth
291
+ }
292
+ end
293
+
294
+ def get_uri url
295
+ if @uri_cache[url] == nil
296
+ @uri_cache[url] = URI.parse url
297
+ end
298
+ return @uri_cache[url]
299
+ end
300
+
301
+ def url_chomp url
302
+ uri = get_uri url
303
+ uri.scheme + "://" + uri.host + uri.path + (uri.query != nil ? "?#{uri.query}" : "")
304
+ end
305
+
306
+ def self.get_services
307
+ @@service_classes.clone
308
+ end
309
+
310
+ def self.get_service_for_url url
311
+ @@service_classes.each do |name, service|
312
+ unless service["url_regex"] == nil && service["url_regex"] =~ url
313
+ return name
314
+ end
315
+ end
316
+ return "base"
317
+ end
318
+
319
+ self.add_service_class "base", "without any authentication", self, false, nil
320
+
321
+ end
322
+
323
+ class SecuredService < BaseService
324
+
325
+ def authenticate
326
+ unless @conf["auth"] != nil && @conf["auth"]["user"] != nil && @conf["auth"]["pass"]
327
+ raise "No authentication (user name and password) given"
328
+ end
329
+ _authenticate
330
+ end
331
+
332
+ def _authenticate
333
+ end
334
+
335
+ end
336
+
337
+ class StudiumKITService < SecuredService
338
+
339
+ def _authenticate
340
+ login_url = "https://studium.kit.edu/_layouts/login.aspx?ReturnUrl=%2f"
341
+ html = nil
342
+ post_html = nil
343
+ begin
344
+ html = fetch_url login_url
345
+ params = {
346
+ "ctl00$PlaceHolderMain$Login$UserName" => @conf["auth"]["user"],
347
+ "ctl00$PlaceHolderMain$Login$password" => @conf["auth"]["pass"],
348
+ "ctl00$PlaceHolderMain$Login$loginbutton" => "Anmelden",
349
+ "__VIEWSTATE" => get_field_value(html, "__VIEWSTATE"),
350
+ "__EVENTVALIDATION" => get_field_value(html, "__EVENTVALIDATION"),
351
+ "__spDummyText1" => "",
352
+ "__spDummyText2" => ""
353
+ }
354
+ rescue => ex
355
+ @log.fatal ex
356
+ raise "Fetching and parsing login page failed"
357
+ end
358
+ begin
359
+ post_html = post login_url, params
360
+ rescue => ex
361
+ @log.fatal ex
362
+ raise "POST request to login page failed"
363
+ end
364
+ if post_html == html
365
+ raise "Authentication failed, wrong user name or password"
366
+ end
367
+ end
368
+
369
+ self.add_service_class "studium_kit", "studium.kit.edu service", self, true, /studium\.kit\.edu/
370
+
371
+ end
372
+
373
+ class HTTPAuthService < SecuredService
374
+
375
+ def _authenticate
376
+ @auth_app = "-u #{URI::escape @conf["auth"]["user"]}:#{URI::escape @conf["auth"]["pass"]}"
377
+ end
378
+
379
+ self.add_service_class "http_auth", "http authenticated service", self, true, nil
380
+
381
+ end
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kitcrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Johannes Bechberger
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-07-12 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.6.1
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.6.1
30
+ - !ruby/object:Gem::Dependency
31
+ name: highline
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: 1.6.0
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: 1.6.0
46
+ - !ruby/object:Gem::Dependency
47
+ name: thor
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: 0.19.0
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: 0.19.0
62
+ - !ruby/object:Gem::Dependency
63
+ name: damerau-levenshtein
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: 1.0.0
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: 1.0.0
78
+ description: ! "\tCrawl lecture websites and fetch the PDFs automatically. \n\tIt
79
+ currently supports the studium.kit.edu and other HTTP password protected sites.\n"
80
+ email: me@mostlynerdless.de
81
+ executables:
82
+ - kitcrawler
83
+ extensions: []
84
+ extra_rdoc_files:
85
+ - README.md
86
+ files:
87
+ - lib/kitcrawler.rb
88
+ - lib/services.rb
89
+ - lib/cli.rb
90
+ - lib/cli_add.rb
91
+ - README.md
92
+ - bin/kitcrawler
93
+ homepage: https://github.com/parttimenerd/KITCrawler
94
+ licenses:
95
+ - GPL v3
96
+ post_install_message:
97
+ rdoc_options: []
98
+ require_paths:
99
+ - lib
100
+ required_ruby_version: !ruby/object:Gem::Requirement
101
+ none: false
102
+ requirements:
103
+ - - ! '>='
104
+ - !ruby/object:Gem::Version
105
+ version: 1.8.6
106
+ required_rubygems_version: !ruby/object:Gem::Requirement
107
+ none: false
108
+ requirements:
109
+ - - ! '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements:
113
+ - Linux (other UNIXes might also work)
114
+ - curl
115
+ rubyforge_project:
116
+ rubygems_version: 1.8.23
117
+ signing_key:
118
+ specification_version: 3
119
+ summary: Fetch lecture PDFs with ease
120
+ test_files: []