kitcrawler 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +48 -0
- data/bin/kitcrawler +5 -0
- data/lib/cli.rb +162 -0
- data/lib/cli_add.rb +151 -0
- data/lib/kitcrawler.rb +16 -0
- data/lib/services.rb +381 -0
- metadata +120 -0
data/README.md
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
KITCrawler
|
2
|
+
===================
|
3
|
+
Fetch lecture PDFs with ease.
|
4
|
+
|
5
|
+
|
6
|
+
It currently supports crawling PDFs for lectures from the studium.kit.edu page,
|
7
|
+
but can be easily extended to fetch PDFs from other services.
|
8
|
+
|
9
|
+
Requirements
|
10
|
+
-------------------
|
11
|
+
- ruby (>= 1.9, but 1.8 might also be okay)
|
12
|
+
- bundler (or install the required gems (see `Gemfile`) manually)
|
13
|
+
- linux (with curl, might also work on other Unixes)
|
14
|
+
|
15
|
+
Install
|
16
|
+
-------------------
|
17
|
+
Simply run
|
18
|
+
```
|
19
|
+
gem install kitcrawler
|
20
|
+
```
|
21
|
+
to install the gem (it's often a bit behind the repo).
|
22
|
+
|
23
|
+
Or run it from source.
|
24
|
+
```
|
25
|
+
git clone https://github.com/parttimenerd/KITCrawler
|
26
|
+
cd KITCrawler
|
27
|
+
bundle install
|
28
|
+
```
|
29
|
+
|
30
|
+
|
31
|
+
Usage
|
32
|
+
-------------------
|
33
|
+
Run
|
34
|
+
```
|
35
|
+
kitcrawler add NAME
|
36
|
+
```
|
37
|
+
to add a new fetch job named `NAME`. This will prompt you to pass an entry URL to the site, etc.
|
38
|
+
|
39
|
+
To finally run your jobs use
|
40
|
+
```
|
41
|
+
kitcrawler fetch
|
42
|
+
```
|
43
|
+
|
44
|
+
It also supports some command line parameters, run `kitcrawler` to see an explanation.
|
45
|
+
|
46
|
+
License
|
47
|
+
-------------------
|
48
|
+
The code is GNU GPL v3 licensed.
|
data/bin/kitcrawler
ADDED
data/lib/cli.rb
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'json'
|
3
|
+
require 'thor'
|
4
|
+
require_relative 'services.rb'
|
5
|
+
require_relative 'cli_add.rb'
|
6
|
+
|
7
|
+
# Directory that holds kitcrawler's configuration files (~/.kitcrawler).
#
# @return [String] absolute path of the configuration directory
def kitgrawler_dir
  # Expand the home directory in-process instead of shelling out to
  # `echo ~` (faster, and works even when no shell is available).
  File.join Dir.home, ".kitcrawler"
end
|
10
|
+
|
11
|
+
# Thor based command line interface of kitcrawler.
#
# The class options below are shared by all sub commands.
class CLI < Thor
  class_option :config_file, :type => :string, :aliases => "-c",
    :default => "#{kitgrawler_dir}/config.json",
    :desc => "Use CONFIG_FILE as config file location"
  class_option :auth_file, :type => :string, :aliases => "-a",
    :default => "#{kitgrawler_dir}/auth.json",
    :desc => "Use AUTH_FILE as authentication location"
  class_option :url_cache_file, :type => :string, :aliases => "-u",
    :default => "#{kitgrawler_dir}/url_type_cache.json",
    :desc => "Use URL_CACHE_FILE as url cache file location"
  class_option :debug, :type => :boolean, :default => false, :aliases => "-d",
    :desc => "Print everything to standard out"
  class_option :verbose, :type => :boolean, :default => false, :aliases => "-v",
    :desc => "Print a lot to standard out"
  class_option :warn, :type => :boolean, :default => false, :aliases => "-w",
    :desc => "Print only warnings and errors to standard out"
  class_option :quiet, :type => :boolean, :default => false, :aliases => "-q",
    :desc => "Print nothing to standard out"

  desc "fetch [NAME]", "Run fetch job NAME or all jobs"
  # NAME is optional (see the desc above): default it to nil so that
  # `kitcrawler fetch` without an argument runs all configured jobs
  # instead of aborting with a Thor arity error.
  def fetch name = nil
    if name == nil
      CLIHelper.new(options).fetch_all
    else
      CLIHelper.new(options).fetch name
    end
  end

  desc "add NAME", "Add new job NAME"
  # Interactively create a new fetch job called NAME (see CLI_ADD).
  def add name
    CLI_ADD.add_config_ui name, options
  end

end
|
45
|
+
|
46
|
+
# Does the actual work behind the CLI commands: loads the config, the
# credentials and the url type cache and runs the configured fetch jobs.
class CLIHelper

  # @param options [Hash] parsed Thor class options (see CLI)
  def initialize options
    @options = options
    `mkdir -p #{kitgrawler_dir}`
    init_logger
    load_conf_file
    load_auth_file
    load_url_type_cache_file
  end

  # Configures the STDOUT logger according to the verbosity options.
  # Later assignments win, so --quiet beats --warn beats --verbose etc.
  def init_logger
    @log = Logger.new STDOUT
    @log_level = Logger::WARN
    @log_level = Logger::DEBUG if @options[:debug]
    @log_level = Logger::INFO if @options[:verbose]
    @log_level = Logger::WARN if @options[:warn]
    # Was the misspelled Logger::UNKOWN, which raised a NameError
    # whenever --quiet was used.
    @log_level = Logger::UNKNOWN if @options[:quiet]
    @log.level = @log_level
    @log.progname = "cli"
  end

  # Loads the job configuration; a missing file yields an empty config.
  def load_conf_file
    @conf = {}
    return unless File.exist?(@options[:config_file])
    begin
      @conf = JSON.load File.read(@options[:config_file])
    rescue => ex
      @log.fatal "Cannot load config file #{@options[:config_file]}"
      @log.fatal ex
      exit 1
    end
  end

  # Loads the credential sets; a missing file yields an empty config.
  def load_auth_file
    @auth_conf = {}
    return unless File.exist?(@options[:auth_file])
    begin
      @auth_conf = JSON.load File.read(@options[:auth_file])
    rescue => ex
      @log.fatal "Cannot load authentication config file #{@options[:auth_file]}"
      @log.fatal ex
      exit 1
    end
  end

  # Loads the cached url => content type map; missing file => empty cache.
  def load_url_type_cache_file
    @url_type_cache = {}
    return unless File.exist?(@options[:url_cache_file])
    begin
      @url_type_cache = JSON.load File.read(@options[:url_cache_file])
    rescue => ex
      @log.fatal "Cannot load url type cache file #{@options[:url_cache_file]}"
      @log.fatal ex
      exit 1
    end
  end

  # Runs every configured fetch job. The url type cache is written back
  # to disk even when a job aborts (ensure block).
  def fetch_all
    begin
      @conf.each_key do |grawl_job|
        fetch grawl_job
      end
    rescue => ex
      @log.fatal "Error grawling configured locations"
      @log.fatal ex
      exit 1
    ensure
      File.open(@options[:url_cache_file], "w") do |f|
        f.puts JSON::pretty_generate @url_type_cache
      end
    end
  end

  # Runs the fetch job named job_name; prints near-miss suggestions when
  # no such job exists.
  def fetch job_name
    grawl_location = job_name
    conf = @conf[job_name]
    if conf == nil
      print_job_name_guess job_name
      return
    end
    begin
      service = BaseService::get_service grawl_location, conf, @auth_conf, @log_level, @url_type_cache
      begin
        service.execute
      rescue => ex
        @log.error "Failed executing #{grawl_location}"
        @log.error ex
      end
    rescue => ex
      @log.error "Failed to instantiate #{grawl_location}"
      @log.error ex
    end
  end

  # Prints the (at most 3) configured job names closest to job_name.
  def print_job_name_guess job_name
    unless @options[:quiet]
      puts "There is no job '#{job_name}'."
      puts "Maybe you meant one of the following"
      best_n_matches(@conf.keys, job_name, 3).each do |name|
        puts "  #{name}"
      end
    end
  end

  # Returns the n strings of arr with the smallest Damerau-Levenshtein
  # distance to comp, closest first.
  def best_n_matches arr, comp, n
    require 'damerau-levenshtein'
    map = {}
    dl = DamerauLevenshtein
    arr.each do |str|
      map[str] = dl.distance(str, comp, 2)
    end
    # The n parameter was previously ignored and the whole sorted array
    # returned; honor it as the method name and the caller expect.
    return arr.sort {|a, b| map[a] <=> map[b] }.first(n)
  end
end
|
161
|
+
|
162
|
+
|
data/lib/cli_add.rb
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'json'
|
3
|
+
require_relative 'services.rb'
|
4
|
+
|
5
|
+
# Interactive (highline based) wizard that adds a new fetch job to the
# config file and, when needed, new credentials to the auth file.
module CLI_ADD

  require 'highline/import'

  # Asks the user for all settings of a new fetch job called name and
  # writes the resulting configuration and credentials back to disk.
  #
  # @param name [String] proposed job name (re-asked on collision)
  # @param options [Hash] Thor class options (file locations, see CLI)
  def self.add_config_ui name, options
    CLIHelper.new options
    config_json = JSON.load File.read(options[:config_file])
    auth_json = JSON.load File.read(options[:auth_file])
    say("Configure #{name}")
    name = check_name name, config_json
    conf = {}
    conf["entry_url"] = ask_entry_url
    conf["type"] = ask_type conf
    conf["pdfs"] = ask_pdfs conf
    # get_services maps type name => service description hash, so look
    # the chosen type up first (indexing the outer hash with
    # "needs_auth" directly always returned nil and skipped the auth
    # questions for every service).
    if BaseService.get_services[conf["type"]]["needs_auth"]
      conf["auth"] = ask_auth name, conf, auth_json
    end
    config_json[name] = conf
    say "This configuration is placed into your config files."
    say "Your config file is #{options[:config_file]}."
    say "Your authentication config file is #{options[:auth_file]}."
    File.open(options[:config_file], "w") do |f|
      f.puts JSON.pretty_generate config_json
    end
    File.open(options[:auth_file], "w") do |f|
      f.puts JSON.pretty_generate auth_json
    end
  end

  # Re-asks for the job name until it doesn't collide with an existing job.
  def self.check_name name, config_json
    names = config_json.keys
    while names.include? name
      name = ask "Fetch job name (#{name} is already in use)? "
    end
    return name
  end

  def self.ask_entry_url
    return ask_url "Entry point url? "
  end

  # Lets the user choose the service type; the type guessed from the
  # entry url is offered as the default.
  def self.ask_type conf
    default = BaseService::get_service_for_url conf["entry_url"]
    choose do |menu|
      menu.prompt = "Service type [#{default}]? "
      menu.default = default
      BaseService::get_services.each do |name, service|
        menu.choices("#{name} (#{service["description"]})") do |q|
          say "You've chosen '#{name}'."
          return name
        end
      end
    end
  end

  # Asks for the pdf related settings of the job.
  def self.ask_pdfs conf
    return {
      "src_folder" => ask_non_empty("Source folder url (relative to entry url directory if starts with dot)? "),
      "dest_folder" => ask_non_empty("Destination folder (relative to $HOME)? "),
      "download_once" => ask_yes_no(
        "Download a PDF only once (ignore changes, boost performance)? ", "yes"
      ) == "yes"
    }
  end

  # Asks which credential set to use: either an existing one from the
  # auth config or a freshly entered user/password pair.
  #
  # @return [String] name of the credential set in auth_json
  def self.ask_auth name, conf, auth_json
    is_studium_kit = conf["type"] == "studium_kit"
    has_s_kit_auth = auth_json["studium_kit"] != nil
    has_name_auth = auth_json[name] != nil
    default = ""
    if ask_yes_no("Auth: Use existing user/password configuration? ", is_studium_kit && has_s_kit_auth ? "yes" : "no") == "yes"
      if is_studium_kit && has_s_kit_auth
        default = "studium_kit"
      end
      choose do |menu|
        menu.prompt = "Auth: Which configuration? "
        menu.default = default unless default.empty?
        auth_json.each do |name, config|
          menu.choices("#{name} (user: #{config["user"]})") do |q|
            say "Auth: You've chosen '#{name}'."
            return name
          end
        end
      end
    else
      auths = auth_json.keys
      auth_name = ""
      if is_studium_kit && !has_s_kit_auth
        default = "studium_kit"
        auth_name = ask "Auth: Configuration name [studium_kit]? " do |q|
          # Was the bare identifier studium_kit, which raised a
          # NameError whenever this branch was taken.
          q.default = "studium_kit"
        end
      elsif not has_name_auth
        auth_name = ask "Auth: Configuration name [#{name}]? " do |q|
          q.default = name
        end
      else
        auth_name = ask_non_empty "Auth: Configuration name? "
      end
      begin
        auth_json[auth_name] = ask_user_pass
      end while ask_yes_no("Auth: Confirm that your credentials are right. Are they? ", "yes") == "no"
      return auth_name
    end
  end

  # Asks for user name and password (password twice, input masked).
  def self.ask_user_pass
    user = ask "Auth: User name? "
    pass = ""
    pass2 = ""
    begin
      pass = ask("Auth: Password? ") { |q| q.echo = "x" }
      pass2 = ask("Auth: Retype it ") { |q| q.echo = "x" }
    end while pass != pass2
    return {
      "user" => user,
      "pass" => pass
    }
  end

  # Asks until the answer looks like a url (at least 4 non blank chars).
  def self.ask_url question
    str = ""
    while str.strip.length < 4
      str = ask(question) || ""
    end
    return str.strip
  end

  # Asks until the answer is non blank.
  def self.ask_non_empty question
    str = ""
    while str.strip.empty?
      str = ask(question) || ""
    end
    return str.strip
  end

  # One line yes/no question; returns "yes" or "no".
  def self.ask_yes_no question, default = "yes"
    choose do |menu|
      menu.layout = :one_line
      menu.prompt = "#{question} [#{default}] "
      menu.default = default
      menu.choices(:yes, :no) do |q|
        return q.to_s
      end
    end
  end
end
|
data/lib/kitcrawler.rb
ADDED
data/lib/services.rb
ADDED
@@ -0,0 +1,381 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'cgi'
|
3
|
+
require 'pp'
|
4
|
+
require 'uri'
|
5
|
+
require 'logger'
|
6
|
+
require 'set'
|
7
|
+
require 'time'
|
8
|
+
require 'uri'
|
9
|
+
require 'json'
|
10
|
+
|
11
|
+
# Base crawler: starting at the configured entry url it walks html pages,
# collects links to PDFs below the configured source folder and downloads
# them via curl. Subclasses implement service specific authentication and
# register themselves with add_service_class.
class BaseService

  @conf = {}
  @auth_app = ""
  @file_header_cache = {} #url => splitted HTTP header lines
  @processed_pdfs = {} #url => dest file
  @name = ""
  # Registered service types, filled by add_service_class.
  # (A bare `@@log` reference stood here before; referencing an
  # uninitialized class variable raises a NameError at class load time.)
  @@service_classes = {}
  @type_cache = {}
  @uri_cache = {} #url => URI

  # @param name [String] job name, used as the logger progname
  # @param conf [Hash] job configuration, merged over the defaults below
  # @param auth_conf [Hash] named credential sets; a String "auth" entry
  #   in conf is resolved against this hash
  # @param log_level [Integer] Logger severity
  # @param url_type_cache [Hash] persisted url => type ("pdf"/"html"/"") cache
  def initialize name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}
    @base_dir = `echo ~`.strip
    @uri_cache = {}
    @file_header_cache = {}
    @type_cache = url_type_cache
    @processed_pdfs = {}
    @name = name
    # Extra curl arguments contributed by authentication (e.g. -u ...).
    @auth_app = ""
    @log = Logger.new(STDOUT)
    @log.progname = name
    @log.level = log_level

    @conf = {
      "type" => "base",
      "exclude_file_endings" => [".css", ".js", ".txt", ".rss", ".atom"],
      "access_pause" => { #in seconds
        "min" => 0.1,
        "max" => 0.3
      },
      "pdfs" => {
        "src_folder" => "abc.de/a", #is relative to entry_url base dir if starts with dot
        "dest_folder" => "abcd",
        "download_once" => true
      },
      "cookie_jar" => "cookies.txt",
      "user_agent" => "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0",
      "entry_url" => "",
      "auth" => "base" #references auth conf or {"user" => "", "pass" => ""}
    }
    temp_conf = @conf.merge conf
    unless conf["pdfs"] == nil
      temp_conf["pdfs"] = @conf["pdfs"].merge conf["pdfs"]
    end
    @conf = temp_conf
    if @conf["auth"].is_a? String
      @conf["auth"] = auth_conf[@conf["auth"]]
      @log.debug "Load auth from auth config #{auth_conf}"
    end

    # A source folder starting with "." is resolved relative to the
    # directory of the entry url.
    if @conf["pdfs"]["src_folder"].start_with? "."
      entry_uri = get_uri @conf["entry_url"]
      entry_path_url = entry_uri.scheme + "://" + entry_uri.host + File.dirname(entry_uri.path)
      @conf["pdfs"]["src_folder"] = "#{entry_path_url}/#{@conf["pdfs"]["src_folder"]}"
      @log.info "Source folder is #{@conf["pdfs"]["src_folder"]}"
    end

    src_url_parsed = URI.parse(@conf["pdfs"]["src_folder"])
    @conf["pdfs"]["src_path"] = src_url_parsed.path
    @conf["pdfs"]["src_host"] = src_url_parsed.host
    @log.info "Start authentication"
    authenticate
    @log.info "Authentication completed"
  end

  # Instantiates the registered service class matching conf["type"].
  # @raise [RuntimeError] when the type is not registered
  def self.get_service name, conf, auth_conf={}, log_level = Logger::WARN, url_type_cache = {}
    service = @@service_classes[conf["type"]]
    if service == nil
      raise "Unknown service #{conf["type"]}"
    else
      service["class"].new name, conf, auth_conf, log_level, url_type_cache
    end
  end

  # No-op in the base service; subclasses authenticate here.
  def authenticate
    ""
  end

  # Entry point: crawl everything reachable from the entry url.
  def execute
    @log.info "Start grawling #{@conf["entry_url"]}"
    parse_html_page @conf["entry_url"]
    @log.info "Completed grawling #{@conf["entry_url"]}"
  end

  # Fetches url, downloads linked PDFs and recurses into linked html
  # pages. url_cache keeps track of already visited urls.
  def parse_html_page url, url_cache = Set.new
    url = url_chomp url
    return if url_cache.member?(url)
    url_cache.add url
    @log.info "Fetch and parse #{url}"
    html = ""
    begin
      html = fetch_url url
      access_pause_sleep
    rescue => ex
      @log.error "Cannot fetch #{url}"
      @log.error ex
      return
    end
    links = parse_html url, html
    links["html"].each do |html_link|
      parse_html_page html_link, url_cache
    end
    links["pdf"].each do |pdf_link|
      process_pdf pdf_link
    end
  end

  ##
  #Executes curl to fetch the requested url
  #@param url requested url
  #@param output_file output destination, if nil the output gets returned by
  #this method
  def fetch_url url, output_file=nil, curl_params=""
    curl_params = "#{@auth_app} #{curl_params} --silent --user-agent \"#{@conf["user_agent"]}\""
    curl_params += " -b #{@conf["cookie_jar"]} -c #{@conf["cookie_jar"]} -L -o \"#{output_file || "-"}\" #{url}"
    @log.debug "Call curl on #{url}"
    @log.debug "Curl parameters '#{curl_params}'"
    `cd #{@base_dir}; curl #{curl_params}`
  end

  # Sends a POST request (via curl) with the given form params.
  def post url, params, output_file=nil, curl_params=""
    param_arr = []
    params.each do |key, value|
      param_arr << "#{CGI::escape(key)}=#{CGI::escape(value)}"
    end
    param = param_arr.join "&"
    begin
      fetch_url url, output_file, "#{curl_params} --data \"#{param}\""
    rescue => ex
      @log.error "Failed to POST #{url} with data #{params}"
      @log.error ex
      ""
    end
  end

  # Extracts all pdf and html links from the html of the given url.
  # @return [Hash] {"pdf" => [...], "html" => [...]}
  def parse_html url, html
    doc = nil
    links = {'pdf' => [], 'html' => []}
    begin
      doc = Nokogiri::HTML html
    rescue => ex
      @log.error "Parsing html from url #{url} failed"
      return links
    end
    doc.css('a[href]').each do |link|
      begin
        link_url = url_chomp(URI.join(url, link.attributes["href"]).to_s).to_s
        @log.debug "Process link #{link_url}"
        if is_pdf_url link_url
          links['pdf'] << link_url
          @log.debug "#{link_url} is pdf"
        elsif is_html_url link_url
          links['html'] << link_url
          @log.debug "#{link_url} is html"
        end
      rescue => ex
        @log.debug "Omit #{link}"
      end
    end
    return links
  end

  # Returns the value attribute of the element with the given id.
  def get_field_value html, field
    doc = nil
    begin
      doc = Nokogiri::HTML html
    rescue => ex
      @log.error "Parsing html failed"
      @log.error ex
      return ""
    end
    value = ""
    doc.css("##{field}").each do |link|
      value = link.attributes["value"].to_s
    end
    return value
  end

  # Classifies url as "pdf", "html" or "" (everything else / excluded),
  # caching the result. The header is only fetched when the extension
  # alone doesn't decide.
  def get_type url
    if is_excluded url
      return ""
    end
    if @type_cache[url] == nil
      # The Content-Type header may be missing; treat it as "" instead
      # of crashing with a NoMethodError on nil.
      if url.upcase.end_with?(".PDF") ||
          (get_file_header(url)["Content-Type"] || "").start_with?("application/pdf", "application/x-pdf")
        @type_cache[url] = "pdf"
      elsif (get_file_header(url)["Content-Type"] || "").start_with?("text/html")
        @type_cache[url] = "html"
      else
        @type_cache[url] = ""
      end
    end
    return @type_cache[url]
  end

  def is_pdf_url url
    get_type(url) == "pdf"
  end

  def is_html_url url
    get_type(url) == "html"
  end

  # A url is excluded when it has one of the excluded file endings, lies
  # on a different host than the source folder or outside its path.
  def is_excluded url
    parsed_url = get_uri url
    # end_with? takes the endings as an argument list; the previous
    # `send(:start_with?, array)` passed the whole array as one argument
    # (a TypeError) and checked the wrong end of the path.
    parsed_url.path.end_with?(*@conf["exclude_file_endings"]) ||
      parsed_url.host != @conf["pdfs"]["src_host"] ||
      !parsed_url.path.start_with?(@conf["pdfs"]["src_path"])
  end

  # Sleeps a random time between the configured min and max pause.
  def access_pause_sleep
    min = @conf["access_pause"]["min"]
    max = @conf["access_pause"]["max"]
    duration = Random.rand() * (max - min) + min
    @log.debug "Sleep #{duration} seconds to behave a bit more human"
    sleep duration
  end

  # Fetches (and caches) the HTTP response header of url.
  # @return [Hash] header field => value
  def get_file_header url
    url = url_chomp url
    if @file_header_cache[url] == nil
      header = fetch_url url, "-", "-I"
      lines = header.split("\r\n").map {|val| val.split(": ") }
      response = {}
      lines.each {|arr| response[arr[0]] = arr[1] }
      @file_header_cache[url] = response
      @log.info "Fetch header of #{url}"
      access_pause_sleep
    end
    return @file_header_cache[url]
  end

  # Returns path plus query string of url (without scheme and host).
  def get_path_url url
    parsed = get_uri url
    # query is nil (not "") when absent; the old "" comparison appended
    # a stray "?" to every query-less url.
    parsed.path + (parsed.query != nil ? "?#{parsed.query}": "")
  end

  # Downloads the pdf at url to its destination path unless it was
  # already processed or is up to date (per config).
  def process_pdf url
    url = url_chomp url
    return unless @processed_pdfs[url] == nil
    @log.info "Process pdf #{url}"
    dest = get_dest_path url
    if not @conf["pdfs"]["download_once"]
      header_date = get_file_header(url)["Last-Modified"]
      header_time = header_date != nil ? Time.parse(header_date).to_i : Time.now.to_i
      file_time = File.exist?(dest) ? File.mtime(dest).to_i : 0
      @log.info "Process pdf #{url} with mtime #{header_time}, file mtime #{file_time}"
      if file_time >= header_time
        @log.info "Destination file #{dest} isn't younger => no download"
        return
      end
    elsif File.exist? dest
      @log.info "Destination file exists => no download"
      return
    end
    `mkdir -p "#{File.dirname(dest)}"` unless File.exist? File.dirname(dest)
    @log.info "Destination file #{dest} is older => download"
    begin
      @log.debug(fetch_url url, dest)
    rescue => ex
      @log.error "Downloading #{url} failed"
      @log.error ex
    end
    @processed_pdfs[url] = dest
    access_pause_sleep
  end

  # Maps a source url to its destination file path (the part of the url
  # path below the source folder, appended to the destination folder).
  def get_dest_path url
    url_path = get_uri(url).path
    src_path = @conf["pdfs"]["src_path"]
    dest_folder = @conf["pdfs"]["dest_folder"]
    dest_folder + "/" + url_path.slice(src_path.length, url_path.length - src_path.length)
  end

  # Registers a service type so get_service / get_service_for_url find it.
  def self.add_service_class name, description, service_class, needs_auth = true, url_regex = nil
    @@service_classes[name] = {
      "class" => service_class,
      "url_regex" => url_regex,
      "description" => description,
      "needs_auth" => needs_auth
    }
  end

  # Parses (and caches) url into a URI.
  def get_uri url
    if @uri_cache[url] == nil
      @uri_cache[url] = URI.parse url
    end
    return @uri_cache[url]
  end

  # Normalizes url to scheme://host/path[?query] (drops the fragment).
  def url_chomp url
    uri = get_uri url
    uri.scheme + "://" + uri.host + uri.path + (uri.query != nil ? "?#{uri.query}" : "")
  end

  def self.get_services
    @@service_classes.clone
  end

  # Guesses the service type for url via the registered url regexes,
  # falling back to "base".
  def self.get_service_for_url url
    @@service_classes.each do |name, service|
      # The old condition (`unless regex == nil && regex =~ url`) was
      # always falsy-or-true in the wrong way and returned the first
      # registered service for every url.
      if service["url_regex"] != nil && service["url_regex"] =~ url
        return name
      end
    end
    return "base"
  end

  self.add_service_class "base", "without any authentication", self, false, nil

end
|
322
|
+
|
323
|
+
# Common base for services that need credentials: validates that a
# user/password pair is configured before delegating to the subclass's
# _authenticate hook.
class SecuredService < BaseService

  # @raise [RuntimeError] when user name or password are missing
  def authenticate
    auth = @conf["auth"]
    if auth.nil? || auth["user"].nil? || !auth["pass"]
      raise "No authentication (user name and password) given"
    end
    _authenticate
  end

  # Hook for subclasses; the base implementation does nothing.
  def _authenticate
  end

end
|
336
|
+
|
337
|
+
# Crawls the studium.kit.edu SharePoint site. Authentication logs in via
# the site's ASP.NET login form; the session cookie ends up in the
# cookie jar used by all subsequent curl calls.
class StudiumKITService < SecuredService

  # Performs the form based login.
  # @raise [RuntimeError] when fetching the login page, the POST request
  #   or the authentication itself fails
  def _authenticate
    login_url = "https://studium.kit.edu/_layouts/login.aspx?ReturnUrl=%2f"
    html = nil
    post_html = nil
    begin
      # Fetch the login page first: the ASP.NET form fields __VIEWSTATE
      # and __EVENTVALIDATION are per-request tokens that must be echoed
      # back in the login POST.
      html = fetch_url login_url
      params = {
        "ctl00$PlaceHolderMain$Login$UserName" => @conf["auth"]["user"],
        "ctl00$PlaceHolderMain$Login$password" => @conf["auth"]["pass"],
        "ctl00$PlaceHolderMain$Login$loginbutton" => "Anmelden",
        "__VIEWSTATE" => get_field_value(html, "__VIEWSTATE"),
        "__EVENTVALIDATION" => get_field_value(html, "__EVENTVALIDATION"),
        "__spDummyText1" => "",
        "__spDummyText2" => ""
      }
    rescue => ex
      @log.fatal ex
      raise "Fetching and parsing login page failed"
    end
    begin
      post_html = post login_url, params
    rescue => ex
      @log.fatal ex
      raise "POST request to login page failed"
    end
    # Heuristic: a failed login serves the unchanged login page again.
    if post_html == html
      raise "Authentication failed, wrong user name or password"
    end
  end

  self.add_service_class "studium_kit", "studium.kit.edu service", self, true, /studium\.kit\.edu/

end
|
372
|
+
|
373
|
+
# Service protected by HTTP basic authentication: passes the configured
# credentials to curl via its -u option.
class HTTPAuthService < SecuredService

  def _authenticate
    require 'shellwords'
    # The credentials end up interpolated into a shell command line
    # (see fetch_url), so they must be shell-escaped. The previous
    # URI::escape neither protected against shell metacharacters nor
    # exists in modern Ruby, and it percent-encoded the credentials
    # that curl sends to the server.
    @auth_app = "-u #{Shellwords.escape("#{@conf["auth"]["user"]}:#{@conf["auth"]["pass"]}")}"
  end

  self.add_service_class "http_auth", "http authenticated service", self, true, nil

end
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: kitcrawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Johannes Bechberger
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-07-12 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.6.1
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.6.1
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: highline
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 1.6.0
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 1.6.0
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: thor
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 0.19.0
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.19.0
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: damerau-levenshtein
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 1.0.0
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.0.0
|
78
|
+
description: ! "\tCrawl lecture websites and fetch the PDFs automatically. \n\tIt
|
79
|
+
currently supports the studium.kit.edu and other HTTP password protected sites.\n"
|
80
|
+
email: me@mostlynerdless.de
|
81
|
+
executables:
|
82
|
+
- kitcrawler
|
83
|
+
extensions: []
|
84
|
+
extra_rdoc_files:
|
85
|
+
- README.md
|
86
|
+
files:
|
87
|
+
- lib/kitcrawler.rb
|
88
|
+
- lib/services.rb
|
89
|
+
- lib/cli.rb
|
90
|
+
- lib/cli_add.rb
|
91
|
+
- README.md
|
92
|
+
- bin/kitcrawler
|
93
|
+
homepage: https://github.com/parttimenerd/KITCrawler
|
94
|
+
licenses:
|
95
|
+
- GPL v3
|
96
|
+
post_install_message:
|
97
|
+
rdoc_options: []
|
98
|
+
require_paths:
|
99
|
+
- lib
|
100
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
102
|
+
requirements:
|
103
|
+
- - ! '>='
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: 1.8.6
|
106
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
107
|
+
none: false
|
108
|
+
requirements:
|
109
|
+
- - ! '>='
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0'
|
112
|
+
requirements:
|
113
|
+
- Linux (other UNIXes might also work)
|
114
|
+
- curl
|
115
|
+
rubyforge_project:
|
116
|
+
rubygems_version: 1.8.23
|
117
|
+
signing_key:
|
118
|
+
specification_version: 3
|
119
|
+
summary: Fetch lecture PDFs with ease
|
120
|
+
test_files: []
|