kitcrawler 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
+ KITCrawler
+ ===================
+ Fetch lecture PDFs with ease.
+
+
+ It currently supports crawling PDFs for lectures from the studium.kit.edu page,
+ but can easily be extended to fetch PDFs from other services.
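+ Extending it essentially means subclassing `BaseService` (or `SecuredService`, if a login is required) in `lib/services.rb` and registering the new class via `add_service_class`. A minimal sketch; the service name, description and URL regex are placeholders:
+ ```
+ class MyService < SecuredService
+   def _authenticate
+     # e.g. set @auth_app to extra curl options, like HTTPAuthService does
+   end
+
+   self.add_service_class "my_service", "my custom service", self, true, /my\.site\.example/
+ end
+ ```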
+
+ Requirements
+ -------------------
+ - ruby (>= 1.9; 1.8 might also work)
+ - bundler (or install the required gems (see `Gemfile`) manually)
+ - linux (with curl; might also work on other Unixes)
+
+ Install
+ -------------------
+ Simply run
+ ```
+ gem install kitcrawler
+ ```
+ to install the gem (it's often a bit behind the repository).
+
+ Or run it from source:
+ ```
+ git clone https://github.com/parttimenerd/KITCrawler
+ cd KITCrawler
+ bundle install
+ ```
+
+
+ Usage
+ -------------------
+ Run
+ ```
+ kitcrawler add NAME
+ ```
+ to add a new fetch job named `NAME`. This will prompt you for an entry URL to the site, the service type, etc.
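+ The answers are stored in `~/.kitcrawler/config.json` (credentials go into `~/.kitcrawler/auth.json`). A resulting entry might look like the following; the job name, URL and folder names are just illustrations:
+ ```
+ {
+   "algo1": {
+     "entry_url": "https://studium.kit.edu/sites/vab/123/Seiten/start.aspx",
+     "type": "studium_kit",
+     "pdfs": {
+       "src_folder": "./Dokumente",
+       "dest_folder": "uni/algo1",
+       "download_once": true
+     },
+     "auth": "studium_kit"
+   }
+ }
+ ```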
+
+ To finally run your jobs, use
+ ```
+ kitcrawler fetch
+ ```
+
+ It also supports some command line parameters; run `kitcrawler` to see an explanation.
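+ For example, to run a single job with verbose output and a non-default config file (file name chosen for illustration):
+ ```
+ kitcrawler fetch algo1 -v -c /path/to/config.json
+ ```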
+
+ License
+ -------------------
+ The code is GNU GPL v3 licensed.
@@ -0,0 +1,5 @@
+ #!/usr/bin/env ruby
+
+ # require 'kitcrawler'
+ require_relative '../lib/kitcrawler.rb'
+ KITCrawler.run_cli
@@ -0,0 +1,162 @@
+ require 'pp'
+ require 'json'
+ require 'thor'
+ require_relative 'services.rb'
+ require_relative 'cli_add.rb'
+
+ def kitcrawler_dir
+   File.expand_path "~/.kitcrawler"
+ end
+
+ class CLI < Thor
+   class_option :config_file, :type => :string, :aliases => "-c",
+     :default => "#{kitcrawler_dir}/config.json",
+     :desc => "Use CONFIG_FILE as config file location"
+   class_option :auth_file, :type => :string, :aliases => "-a",
+     :default => "#{kitcrawler_dir}/auth.json",
+     :desc => "Use AUTH_FILE as authentication location"
+   class_option :url_cache_file, :type => :string, :aliases => "-u",
+     :default => "#{kitcrawler_dir}/url_type_cache.json",
+     :desc => "Use URL_CACHE_FILE as url cache file location"
+   class_option :debug, :type => :boolean, :default => false, :aliases => "-d",
+     :desc => "Print everything to standard out"
+   class_option :verbose, :type => :boolean, :default => false, :aliases => "-v",
+     :desc => "Print a lot to standard out"
+   class_option :warn, :type => :boolean, :default => false, :aliases => "-w",
+     :desc => "Print only warnings and errors to standard out"
+   class_option :quiet, :type => :boolean, :default => false, :aliases => "-q",
+     :desc => "Print nothing to standard out"
+
+   desc "fetch [NAME]", "Run fetch job NAME or all jobs"
+   def fetch name = nil
+     if name.nil?
+       CLIHelper.new(options).fetch_all
+     else
+       CLIHelper.new(options).fetch name
+     end
+   end
+
+   desc "add NAME", "Add new job NAME"
+   def add name
+     CLI_ADD.add_config_ui name, options
+   end
+
+ end
+
+ class CLIHelper
+
+   def initialize options
+     @options = options
+     `mkdir -p #{kitcrawler_dir}`
+     init_logger
+     load_conf_file
+     load_auth_file
+     load_url_type_cache_file
+   end
+
+   def init_logger
+     @log = Logger.new STDOUT
+     @log_level = Logger::WARN
+     @log_level = Logger::DEBUG if @options[:debug]
+     @log_level = Logger::INFO if @options[:verbose]
+     @log_level = Logger::WARN if @options[:warn]
+     @log_level = Logger::UNKNOWN if @options[:quiet]
+     @log.level = @log_level
+     @log.progname = "cli"
+   end
+
+   def load_conf_file
+     @conf = {}
+     return unless File.exists?(@options[:config_file])
+     begin
+       @conf = JSON.load File.read(@options[:config_file])
+     rescue => ex
+       @log.fatal "Cannot load config file #{@options[:config_file]}"
+       @log.fatal ex
+       exit 1
+     end
+   end
+
+   def load_auth_file
+     @auth_conf = {}
+     return unless File.exists?(@options[:auth_file])
+     begin
+       @auth_conf = JSON.load File.read(@options[:auth_file])
+     rescue => ex
+       @log.fatal "Cannot load authentication config file #{@options[:auth_file]}"
+       @log.fatal ex
+       exit 1
+     end
+   end
+
+   def load_url_type_cache_file
+     @url_type_cache = {}
+     return unless File.exists?(@options[:url_cache_file])
+     begin
+       @url_type_cache = JSON.load File.read(@options[:url_cache_file])
+     rescue => ex
+       @log.fatal "Cannot load url type cache file #{@options[:url_cache_file]}"
+       @log.fatal ex
+       exit 1
+     end
+   end
+
+   def fetch_all
+     begin
+       @conf.each_key do |crawl_job|
+         fetch crawl_job
+       end
+     rescue => ex
+       @log.fatal "Error crawling configured locations"
+       @log.fatal ex
+       exit 1
+     ensure
+       File.open(@options[:url_cache_file], "w") do |f|
+         f.puts JSON::pretty_generate @url_type_cache
+       end
+     end
+   end
+
+   def fetch job_name
+     crawl_location = job_name
+     conf = @conf[job_name]
+     if conf.nil?
+       print_job_name_guess job_name
+       return
+     end
+     begin
+       service = BaseService::get_service crawl_location, conf, @auth_conf, @log_level, @url_type_cache
+       begin
+         service.execute
+       rescue => ex
+         @log.error "Failed executing #{crawl_location}"
+         @log.error ex
+       end
+     rescue => ex
+       @log.error "Failed to instantiate #{crawl_location}"
+       @log.error ex
+     end
+   end
+
+   def print_job_name_guess job_name
+     unless @options[:quiet]
+       puts "There is no job '#{job_name}'."
+       puts "Maybe you meant one of the following:"
+       best_n_matches(@conf.keys, job_name, 3).each do |name|
+         puts " #{name}"
+       end
+     end
+   end
+
+   def best_n_matches arr, comp, n
+     require 'damerau-levenshtein'
+     map = {}
+     dl = DamerauLevenshtein
+     arr.each do |str|
+       map[str] = dl.distance(str, comp, 2)
+     end
+     # sort by edit distance and return only the n closest names
+     return arr.sort {|a, b| map[a] <=> map[b] }.first(n)
+   end
+ end
+
+
@@ -0,0 +1,151 @@
+ require 'pp'
+ require 'json'
+ require_relative 'services.rb'
+
+ module CLI_ADD
+
+   require 'highline/import'
+
+   def self.add_config_ui name, options
+     CLIHelper.new options # creates ~/.kitcrawler if needed
+     config_json = File.exists?(options[:config_file]) ? JSON.load(File.read(options[:config_file])) : {}
+     auth_json = File.exists?(options[:auth_file]) ? JSON.load(File.read(options[:auth_file])) : {}
+     say("Configure #{name}")
+     name = check_name name, config_json
+     conf = {}
+     conf["entry_url"] = ask_entry_url
+     conf["type"] = ask_type conf
+     conf["pdfs"] = ask_pdfs conf
+     if BaseService.get_services[conf["type"]]["needs_auth"]
+       conf["auth"] = ask_auth name, conf, auth_json
+     end
+     config_json[name] = conf
+     say "This configuration is placed into your config files."
+     say "Your config file is #{options[:config_file]}."
+     say "Your authentication config file is #{options[:auth_file]}."
+     File.open(options[:config_file], "w") do |f|
+       f.puts JSON.pretty_generate config_json
+     end
+     File.open(options[:auth_file], "w") do |f|
+       f.puts JSON.pretty_generate auth_json
+     end
+   end
+
+   def self.check_name name, config_json
+     names = config_json.keys
+     while names.include? name
+       name = ask "Fetch job name (#{name} is already in use)? "
+     end
+     return name
+   end
+
+   def self.ask_entry_url
+     return ask_url "Entry point url? "
+   end
+
+   def self.ask_type conf
+     default = BaseService::get_service_for_url conf["entry_url"]
+     choose do |menu|
+       menu.prompt = "Service type [#{default}]? "
+       menu.default = default
+       BaseService::get_services.each do |name, service|
+         menu.choices("#{name} (#{service["description"]})") do |q|
+           say "You've chosen '#{name}'."
+           return name
+         end
+       end
+     end
+   end
+
+   def self.ask_pdfs conf
+     return {
+       "src_folder" => ask_non_empty("Source folder url (relative to entry url directory if it starts with a dot)? "),
+       "dest_folder" => ask_non_empty("Destination folder (relative to $HOME)? "),
+       "download_once" => ask_yes_no(
+         "Download a PDF only once (ignore changes, boost performance)? ", "yes"
+       ) == "yes"
+     }
+   end
+
+   def self.ask_auth name, conf, auth_json
+     is_studium_kit = conf["type"] == "studium_kit"
+     has_s_kit_auth = auth_json["studium_kit"] != nil
+     has_name_auth = auth_json[name] != nil
+     default = ""
+     if ask_yes_no("Auth: Use existing user/password configuration? ", is_studium_kit && has_s_kit_auth ? "yes" : "no") == "yes"
+       if is_studium_kit && has_s_kit_auth
+         default = "studium_kit"
+       end
+       choose do |menu|
+         menu.prompt = "Auth: Which configuration? "
+         menu.default = default unless default.empty?
+         auth_json.each do |auth_name, config|
+           menu.choices("#{auth_name} (user: #{config["user"]})") do |q|
+             say "Auth: You've chosen '#{auth_name}'."
+             return auth_name
+           end
+         end
+       end
+     else
+       auths = auth_json.keys
+       auth_name = ""
+       if is_studium_kit && !has_s_kit_auth
+         default = "studium_kit"
+         auth_name = ask "Auth: Configuration name [studium_kit]? " do |q|
+           q.default = default
+         end
+       elsif not has_name_auth
+         auth_name = ask "Auth: Configuration name [#{name}]? " do |q|
+           q.default = name
+         end
+       else
+         auth_name = ask_non_empty "Auth: Configuration name? "
+       end
+       begin
+         auth_json[auth_name] = ask_user_pass
+       end while ask_yes_no("Auth: Confirm that your credentials are right. Are they? ", "yes") == "no"
+       return auth_name
+     end
+   end
+
+   def self.ask_user_pass
+     user = ask "Auth: User name? "
+     pass = ""
+     pass2 = ""
+     begin
+       pass = ask("Auth: Password? ") { |q| q.echo = "x" }
+       pass2 = ask("Auth: Retype it ") { |q| q.echo = "x" }
+     end while pass != pass2
+     return {
+       "user" => user,
+       "pass" => pass
+     }
+   end
+
+   def self.ask_url question
+     str = ""
+     while str.strip.length < 4
+       str = ask(question) || ""
+     end
+     return str.strip
+   end
+
+   def self.ask_non_empty question
+     str = ""
+     while str.strip.empty?
+       str = ask(question) || ""
+     end
+     return str.strip
+   end
+
+   def self.ask_yes_no question, default = "yes"
+     choose do |menu|
+       menu.layout = :one_line
+       menu.prompt = "#{question} [#{default}] "
+       menu.default = default
+       menu.choices(:yes, :no) do |q|
+         return q.to_s
+       end
+     end
+   end
+ end
@@ -0,0 +1,16 @@
+ #!/usr/bin/env ruby
+
+ require 'logger'
+ require 'json'
+ require 'optparse'
+
+ module KITCrawler
+
+   require_relative 'services.rb'
+   require_relative 'cli_add.rb'
+   require_relative 'cli.rb'
+
+   def self.run_cli
+     CLI.start ARGV
+   end
+ end
@@ -0,0 +1,381 @@
+ require 'nokogiri'
+ require 'cgi'
+ require 'pp'
+ require 'uri'
+ require 'logger'
+ require 'set'
+ require 'time'
+ require 'json'
+
+ class BaseService
+
+   @conf = {}
+   @auth_app = ""
+   @file_header_cache = {} # url => split HTTP header lines
+   @processed_pdfs = {} # url => dest file
+   @name = ""
+   @@log = nil
+   @@service_classes = {}
+   @type_cache = {}
+   @uri_cache = {} # url => URI
+
+   def initialize name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}
+     @base_dir = File.expand_path "~" # the user's home directory
+     @uri_cache = {}
+     @file_header_cache = {}
+     @type_cache = url_type_cache
+     @processed_pdfs = {}
+     @name = name
+     @log = Logger.new(STDOUT)
+     @log.progname = name
+     @log.level = log_level
+
+     @conf = {
+       "type" => "base",
+       "exclude_file_endings" => [".css", ".js", ".txt", ".rss", ".atom"],
+       "access_pause" => { # in seconds
+         "min" => 0.1,
+         "max" => 0.3
+       },
+       "pdfs" => {
+         "src_folder" => "abc.de/a", # relative to the entry_url base dir if it starts with a dot
+         "dest_folder" => "abcd",
+         "download_once" => true
+       },
+       "cookie_jar" => "cookies.txt",
+       "user_agent" => "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0",
+       "entry_url" => "",
+       "auth" => "base" # references an auth conf or {"user" => "", "pass" => ""}
+     }
+     temp_conf = @conf.merge conf
+     unless conf["pdfs"] == nil
+       temp_conf["pdfs"] = @conf["pdfs"].merge conf["pdfs"]
+     end
+     @conf = temp_conf
+     if @conf["auth"].is_a? String
+       @conf["auth"] = auth_conf[@conf["auth"]]
+       @log.debug "Load auth from auth config #{auth_conf}"
+     end
+
+     if @conf["pdfs"]["src_folder"].start_with? "."
+       entry_uri = get_uri @conf["entry_url"]
+       entry_path_url = entry_uri.scheme + "://" + entry_uri.host + File.dirname(entry_uri.path)
+       @conf["pdfs"]["src_folder"] = "#{entry_path_url}/#{@conf["pdfs"]["src_folder"]}"
+       @log.info "Source folder is #{@conf["pdfs"]["src_folder"]}"
+     end
+
+     src_url_parsed = URI.parse(@conf["pdfs"]["src_folder"])
+     @conf["pdfs"]["src_path"] = src_url_parsed.path
+     @conf["pdfs"]["src_host"] = src_url_parsed.host
+     @log.info "Start authentication"
+     authenticate
+     @log.info "Authentication completed"
+   end
+
+   def self.get_service name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}
+     service = @@service_classes[conf["type"]]
+     if service == nil
+       raise "Unknown service #{conf["type"]}"
+     else
+       service["class"].new name, conf, auth_conf, log_level, url_type_cache
+     end
+   end
+
+   def authenticate
+     ""
+   end
+
+   def execute
+     @log.info "Start crawling #{@conf["entry_url"]}"
+     parse_html_page @conf["entry_url"]
+     @log.info "Completed crawling #{@conf["entry_url"]}"
+   end
+
+   def parse_html_page url, url_cache = Set.new
+     url = url_chomp url
+     return if url_cache.member?(url)
+     url_cache.add url
+     @log.info "Fetch and parse #{url}"
+     html = ""
+     begin
+       html = fetch_url url
+       access_pause_sleep
+     rescue => ex
+       @log.error "Cannot fetch #{url}"
+       @log.error ex
+       return
+     end
+     links = parse_html url, html
+     links["html"].each do |html_link|
+       parse_html_page html_link, url_cache
+     end
+     links["pdf"].each do |pdf_link|
+       process_pdf pdf_link
+     end
+   end
+
+   ##
+   # Executes curl to fetch the requested url.
+   # @param url requested url
+   # @param output_file output destination; if nil, the output gets returned by
+   #   this method
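+   # Example call (hypothetical URL): fetch_url "https://example.org/a.pdf", "lectures/a.pdf"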
+   def fetch_url url, output_file = nil, curl_params = ""
+     curl_params = "#{@auth_app} #{curl_params} --silent --user-agent \"#{@conf["user_agent"]}\""
+     # quote the url so query strings with & survive the shell
+     curl_params += " -b #{@conf["cookie_jar"]} -c #{@conf["cookie_jar"]} -L -o \"#{output_file || "-"}\" \"#{url}\""
+     @log.debug "Call curl on #{url}"
+     @log.debug "Curl parameters '#{curl_params}'"
+     `cd #{@base_dir}; curl #{curl_params}`
+   end
+
+   def post url, params, output_file = nil, curl_params = ""
+     param_arr = []
+     params.each do |key, value|
+       param_arr << "#{CGI::escape(key)}=#{CGI::escape(value)}"
+     end
+     param = param_arr.join "&"
+     begin
+       fetch_url url, output_file, "#{curl_params} --data \"#{param}\""
+     rescue => ex
+       @log.error "Failed to POST #{url} with data #{params}"
+       @log.error ex
+       ""
+     end
+   end
145
+
146
+ def parse_html url, html
147
+ doc = nil
148
+ links = {'pdf' => [], 'html' => []}
149
+ begin
150
+ doc = Nokogiri::HTML html
151
+ rescue => ex
152
+ @log.error "Parsing html from url #{url} failed"
153
+ return links
154
+ end
155
+ doc.css('a[href]').each do |link|
156
+ begin
157
+ link_url = url_chomp(URI.join(url, link.attributes["href"]).to_s).to_s
158
+ @log.debug "Process link #{link_url}"
159
+ if is_pdf_url link_url
160
+ links['pdf'] << link_url
161
+ @log.debug "#{link_url} is pdf"
162
+ elsif is_html_url link_url
163
+ links['html'] << link_url
164
+ @log.debug "#{link_url} is html"
165
+ end
166
+ rescue => ex
167
+ @log.debug "Omit #{link}"
168
+ end
169
+ end
170
+ return links
171
+ end
+
+   def get_field_value html, field
+     doc = nil
+     begin
+       doc = Nokogiri::HTML html
+     rescue => ex
+       @log.error "Parsing html failed"
+       @log.error ex
+       return ""
+     end
+     value = ""
+     doc.css("##{field}").each do |link|
+       value = link.attributes["value"].to_s
+     end
+     return value
+   end
+
+   def get_type url
+     if is_excluded url
+       return ""
+     end
+     if @type_cache[url] == nil
+       # a missing Content-Type header counts as unknown instead of crashing
+       if url.upcase.end_with?(".PDF") ||
+           (get_file_header(url)["Content-Type"] || "").start_with?("application/pdf", "application/x-pdf")
+         @type_cache[url] = "pdf"
+       elsif (get_file_header(url)["Content-Type"] || "").start_with?("text/html")
+         @type_cache[url] = "html"
+       else
+         @type_cache[url] = ""
+       end
+     end
+     return @type_cache[url]
+   end
+
+   def is_pdf_url url
+     get_type(url) == "pdf"
+   end
+
+   def is_html_url url
+     get_type(url) == "html"
+   end
+
+   def is_excluded url
+     parsed_url = get_uri url
+     # exclude configured file endings and everything outside the source folder
+     parsed_url.path.end_with?(*@conf["exclude_file_endings"]) ||
+       parsed_url.host != @conf["pdfs"]["src_host"] ||
+       !parsed_url.path.start_with?(@conf["pdfs"]["src_path"])
+   end
+
+   def access_pause_sleep
+     min = @conf["access_pause"]["min"]
+     max = @conf["access_pause"]["max"]
+     duration = Random.rand() * (max - min) + min
+     @log.debug "Sleep #{duration} seconds to behave a bit more human"
+     sleep duration
+   end
+
+   def get_file_header url
+     url = url_chomp url
+     if @file_header_cache[url] == nil
+       header = fetch_url url, "-", "-I"
+       lines = header.split("\r\n").map {|val| val.split(": ") }
+       response = {}
+       lines.each {|arr| response[arr[0]] = arr[1] }
+       @file_header_cache[url] = response
+       @log.info "Fetch header of #{url}"
+       access_pause_sleep
+     end
+     return @file_header_cache[url]
+   end
+
+   def get_path_url url
+     parsed = get_uri url
+     parsed.path + (parsed.query != nil ? "?#{parsed.query}" : "")
+   end
+
+   def process_pdf url
+     url = url_chomp url
+     return unless @processed_pdfs[url] == nil
+     @log.info "Process pdf #{url}"
+     dest = get_dest_path url
+     if not @conf["pdfs"]["download_once"]
+       header_date = get_file_header(url)["Last-Modified"]
+       header_time = header_date != nil ? Time.parse(header_date).to_i : Time.now.to_i
+       file_time = File.exists?(dest) ? File.mtime(dest).to_i : 0
+       @log.info "Process pdf #{url} with mtime #{header_time}, file mtime #{file_time}"
+       if file_time >= header_time
+         @log.info "Destination file #{dest} isn't younger => no download"
+         return
+       end
+     elsif File.exists? dest
+       @log.info "Destination file exists => no download"
+       return
+     end
+     `mkdir -p "#{File.dirname(dest)}"` unless File.exists? File.dirname(dest)
+     @log.info "Destination file #{dest} is older => download"
+     begin
+       @log.debug(fetch_url(url, dest))
+     rescue => ex
+       @log.error "Downloading #{url} failed"
+       @log.error ex
+     end
+     @processed_pdfs[url] = dest
+     access_pause_sleep
+   end
+
+   def get_dest_path url
+     url_path = get_uri(url).path
+     src_path = @conf["pdfs"]["src_path"]
+     dest_folder = @conf["pdfs"]["dest_folder"]
+     dest_folder + "/" + url_path.slice(src_path.length, url_path.length - src_path.length)
+   end
+
+   def self.add_service_class name, description, service_class, needs_auth = true, url_regex = nil
+     @@service_classes[name] = {
+       "class" => service_class,
+       "url_regex" => url_regex,
+       "description" => description,
+       "needs_auth" => needs_auth
+     }
+   end
+
+   def get_uri url
+     if @uri_cache[url] == nil
+       @uri_cache[url] = URI.parse url
+     end
+     return @uri_cache[url]
+   end
+
+   def url_chomp url
+     uri = get_uri url
+     uri.scheme + "://" + uri.host + uri.path + (uri.query != nil ? "?#{uri.query}" : "")
+   end
+
+   def self.get_services
+     @@service_classes.clone
+   end
+
+   def self.get_service_for_url url
+     # pick the first registered service whose URL regex matches
+     @@service_classes.each do |name, service|
+       if service["url_regex"] != nil && service["url_regex"] =~ url
+         return name
+       end
+     end
+     return "base"
+   end
+
+   self.add_service_class "base", "without any authentication", self, false, nil
+
+ end
+
+ class SecuredService < BaseService
+
+   def authenticate
+     unless @conf["auth"] != nil && @conf["auth"]["user"] != nil && @conf["auth"]["pass"] != nil
+       raise "No authentication (user name and password) given"
+     end
+     _authenticate
+   end
+
+   def _authenticate
+   end
+
+ end
+
+ class StudiumKITService < SecuredService
+
+   def _authenticate
+     login_url = "https://studium.kit.edu/_layouts/login.aspx?ReturnUrl=%2f"
+     html = nil
+     post_html = nil
+     params = nil
+     begin
+       html = fetch_url login_url
+       params = {
+         "ctl00$PlaceHolderMain$Login$UserName" => @conf["auth"]["user"],
+         "ctl00$PlaceHolderMain$Login$password" => @conf["auth"]["pass"],
+         "ctl00$PlaceHolderMain$Login$loginbutton" => "Anmelden",
+         "__VIEWSTATE" => get_field_value(html, "__VIEWSTATE"),
+         "__EVENTVALIDATION" => get_field_value(html, "__EVENTVALIDATION"),
+         "__spDummyText1" => "",
+         "__spDummyText2" => ""
+       }
+     rescue => ex
+       @log.fatal ex
+       raise "Fetching and parsing login page failed"
+     end
+     begin
+       post_html = post login_url, params
+     rescue => ex
+       @log.fatal ex
+       raise "POST request to login page failed"
+     end
+     if post_html == html
+       raise "Authentication failed, wrong user name or password"
+     end
+   end
+
+   self.add_service_class "studium_kit", "studium.kit.edu service", self, true, /studium\.kit\.edu/
+
+ end
+
+ class HTTPAuthService < SecuredService
+
+   def _authenticate
+     @auth_app = "-u #{URI::escape @conf["auth"]["user"]}:#{URI::escape @conf["auth"]["pass"]}"
+   end
+
+   self.add_service_class "http_auth", "http authenticated service", self, true, nil
+
+ end
metadata ADDED
@@ -0,0 +1,120 @@
+ --- !ruby/object:Gem::Specification
+ name: kitcrawler
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+   prerelease:
+ platform: ruby
+ authors:
+ - Johannes Bechberger
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-07-12 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.6.1
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.6.1
+ - !ruby/object:Gem::Dependency
+   name: highline
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.6.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.6.0
+ - !ruby/object:Gem::Dependency
+   name: thor
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.19.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.19.0
+ - !ruby/object:Gem::Dependency
+   name: damerau-levenshtein
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.0.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.0.0
+ description: ! "\tCrawl lecture websites and fetch the PDFs automatically. \n\tIt
+   currently supports the studium.kit.edu and other HTTP password protected sites.\n"
+ email: me@mostlynerdless.de
+ executables:
+ - kitcrawler
+ extensions: []
+ extra_rdoc_files:
+ - README.md
+ files:
+ - lib/kitcrawler.rb
+ - lib/services.rb
+ - lib/cli.rb
+ - lib/cli_add.rb
+ - README.md
+ - bin/kitcrawler
+ homepage: https://github.com/parttimenerd/KITCrawler
+ licenses:
+ - GPL v3
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: 1.8.6
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements:
+ - Linux (other UNIXes might also work)
+ - curl
+ rubyforge_project:
+ rubygems_version: 1.8.23
+ signing_key:
+ specification_version: 3
+ summary: Fetch lecture PDFs with ease
+ test_files: []