kitcrawler 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +48 -0
- data/bin/kitcrawler +5 -0
- data/lib/cli.rb +162 -0
- data/lib/cli_add.rb +151 -0
- data/lib/kitcrawler.rb +16 -0
- data/lib/services.rb +381 -0
- metadata +120 -0
data/README.md
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
KITCrawler
|
2
|
+
===================
|
3
|
+
Fetch lecture PDFs with ease.
|
4
|
+
|
5
|
+
|
6
|
+
It currently supports crawling PDFs for lectures from the studium.kit.edu page,
|
7
|
+
but can be easily extended to fetch PDFs from other services.
|
8
|
+
|
9
|
+
Requirements
|
10
|
+
-------------------
|
11
|
+
- ruby (>= 1.9, but 1.8 might also be okay)
|
12
|
+
- bundler (or install the required gems (see `Gemfile`) manually)
|
13
|
+
- linux (with curl, might also work on other Unixes)
|
14
|
+
|
15
|
+
Install
|
16
|
+
-------------------
|
17
|
+
Simply run
|
18
|
+
```
|
19
|
+
gem install kitcrawler
|
20
|
+
```
|
21
|
+
to install the gem (it's often a bit behind the repo).
|
22
|
+
|
23
|
+
Or run it from source.
|
24
|
+
```
|
25
|
+
git clone https://github.com/parttimenerd/KITCrawler
|
26
|
+
cd KITCrawler
|
27
|
+
bundle install
|
28
|
+
```
|
29
|
+
|
30
|
+
|
31
|
+
Usage
|
32
|
+
-------------------
|
33
|
+
Run
|
34
|
+
```
|
35
|
+
kitcrawler add NAME
|
36
|
+
```
|
37
|
+
to add a new fetch job named `NAME`. This will prompt you to pass an entry URL to the site, etc.
|
38
|
+
|
39
|
+
To finally run your jobs use
|
40
|
+
```
|
41
|
+
kitcrawler fetch
|
42
|
+
```
|
43
|
+
|
44
|
+
It also supports some command line parameters, run `kitcrawler` to see an explanation.
|
45
|
+
|
46
|
+
License
|
47
|
+
-------------------
|
48
|
+
The code is GNU GPL v3 licensed.
|
data/bin/kitcrawler
ADDED
data/lib/cli.rb
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'json'
|
3
|
+
require 'thor'
|
4
|
+
require_relative 'services.rb'
|
5
|
+
require_relative 'cli_add.rb'
|
6
|
+
|
7
|
+
# Directory where kitcrawler stores its configuration files (~/.kitcrawler).
#
# Uses File.expand_path instead of shelling out to `echo ~`, which avoids
# spawning a subshell on every call while producing the same result.
#
# @return [String] absolute path to the configuration directory
def kitgrawler_dir
  File.expand_path("~/.kitcrawler")
end
|
10
|
+
|
11
|
+
# Thor-based command line interface for kitcrawler.
#
# The global options configure the locations of the config, auth and url
# type cache files (all defaulting to files inside ~/.kitcrawler) and the
# log verbosity.
class CLI < Thor
  class_option :config_file, :type => :string, :aliases => "-c",
               :default => "#{kitgrawler_dir}/config.json",
               :desc => "Use CONFIG_FILE as config file location"
  class_option :auth_file, :type => :string, :aliases => "-a",
               :default => "#{kitgrawler_dir}/auth.json",
               :desc => "Use AUTH_FILE as authentication location"
  class_option :url_cache_file, :type => :string, :aliases => "-u",
               :default => "#{kitgrawler_dir}/url_type_cache.json",
               :desc => "Use URL_CACHE_FILE as url cache file location"
  class_option :debug, :type => :boolean, :default => false, :aliases => "-d",
               :desc => "Print everything to standard out"
  class_option :verbose, :type => :boolean, :default => false, :aliases => "-v",
               :desc => "Print a lot to standard out"
  class_option :warn, :type => :boolean, :default => false, :aliases => "-w",
               :desc => "Print only warnings and errors to standard out"
  class_option :quiet, :type => :boolean, :default => false, :aliases => "-q",
               :desc => "Print nothing to standard out"

  desc "fetch [NAME]", "Run fetch job NAME or all jobs"
  # Runs a single fetch job, or every configured job when NAME is omitted.
  # BUGFIX: NAME was a required positional parameter although the usage
  # string advertises it as optional; it now defaults to nil so that
  # `kitcrawler fetch` works as documented.
  def fetch name = nil
    if name == nil
      CLIHelper.new(options).fetch_all
    else
      CLIHelper.new(options).fetch name
    end
  end

  desc "add NAME", "Add new job NAME"
  # Interactively adds a new fetch job configuration named NAME.
  def add name
    CLI_ADD.add_config_ui name, options
  end

end
|
45
|
+
|
46
|
+
# Implements the actual CLI commands: loads the config, auth and url type
# cache files, runs fetch jobs via BaseService and writes the url type
# cache back to disk when all jobs are done.
class CLIHelper

  # @param options [Hash] parsed Thor options: :config_file, :auth_file,
  #   :url_cache_file plus the verbosity flags
  def initialize options
    @options = options
    `mkdir -p #{kitgrawler_dir}`
    init_logger
    load_conf_file
    load_auth_file
    load_url_type_cache_file
  end

  # Configures the logger from the verbosity flags; later assignments win,
  # so -q beats -w beats -v beats -d.
  def init_logger
    @log = Logger.new STDOUT
    @log_level = Logger::WARN
    @log_level = Logger::DEBUG if @options[:debug]
    @log_level = Logger::INFO if @options[:verbose]
    @log_level = Logger::WARN if @options[:warn]
    # BUGFIX: was Logger::UNKOWN (typo), which raised a NameError whenever
    # the --quiet flag was used.
    @log_level = Logger::UNKNOWN if @options[:quiet]
    @log.level = @log_level
    @log.progname = "cli"
  end

  # Loads the job configuration JSON; a missing file yields an empty
  # config, an unparsable file is fatal.
  def load_conf_file
    @conf = {}
    # File.exist? replaces the deprecated File.exists? (removed in Ruby 3.2).
    return unless File.exist?(@options[:config_file])
    begin
      @conf = JSON.load File.read(@options[:config_file])
    rescue => ex
      @log.fatal "Cannot load config file #{@options[:config_file]}"
      @log.fatal ex
      exit 1
    end
  end

  # Loads the authentication JSON (user/password entries referenced by jobs).
  def load_auth_file
    @auth_conf = {}
    return unless File.exist?(@options[:auth_file])
    begin
      @auth_conf = JSON.load File.read(@options[:auth_file])
    rescue => ex
      @log.fatal "Cannot load authentication config file #{@options[:auth_file]}"
      @log.fatal ex
      exit 1
    end
  end

  # Loads the cached url => content-type mapping that lets the crawler
  # skip repeated HEAD requests.
  def load_url_type_cache_file
    @url_type_cache = {}
    return unless File.exist?(@options[:url_cache_file])
    begin
      @url_type_cache = JSON.load File.read(@options[:url_cache_file])
    rescue => ex
      @log.fatal "Cannot load url type cache file #{@options[:url_cache_file]}"
      @log.fatal ex
      exit 1
    end
  end

  # Runs every configured job; the url type cache is written back even
  # when a job fails (ensure clause).
  def fetch_all
    begin
      @conf.each_key do |grawl_job|
        fetch grawl_job
      end
    rescue => ex
      @log.fatal "Error grawling configured locations"
      @log.fatal ex
      exit 1
    ensure
      File.open(@options[:url_cache_file], "w") do |f|
        f.puts JSON::pretty_generate @url_type_cache
      end
    end
  end

  # Runs a single job; prints name suggestions when the job is unknown.
  # Instantiation and execution errors are logged but do not abort.
  def fetch job_name
    grawl_location = job_name
    conf = @conf[job_name]
    if conf == nil
      print_job_name_guess job_name
      return
    end
    begin
      service = BaseService::get_service grawl_location, conf, @auth_conf, @log_level, @url_type_cache
      begin
        service.execute
      rescue => ex
        @log.error "Failed executing #{grawl_location}"
        @log.error ex
      end
    rescue => ex
      @log.error "Failed to instantiate #{grawl_location}"
      @log.error ex
    end
  end

  # Suggests up to three existing job names similar to the unknown one.
  def print_job_name_guess job_name
    unless @options[:quiet]
      puts "There is no job '#{job_name}'."
      puts "Maybe you meant one of the following"
      best_n_matches(@conf.keys, job_name, 3).each do |name|
        puts "  #{name}"
      end
    end
  end

  # Returns the n entries of arr closest to comp by Damerau-Levenshtein
  # distance, smallest distance first.
  # BUGFIX: the n parameter was ignored and the complete sorted array was
  # returned; it is now honored via .first(n).
  def best_n_matches arr, comp, n
    require 'damerau-levenshtein'
    map = {}
    dl = DamerauLevenshtein
    arr.each do |str|
      map[str] = dl.distance(str, comp, 2)
    end
    return arr.sort {|a, b| map[a] <=> map[b] }.first(n)
  end
end
|
161
|
+
|
162
|
+
|
data/lib/cli_add.rb
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'json'
|
3
|
+
require_relative 'services.rb'
|
4
|
+
|
5
|
+
# Interactive (highline-based) wizard that adds a new fetch job to the
# config and auth JSON files.
module CLI_ADD

  require 'highline/import'

  # Asks the user for all settings of a new job named +name+ and writes
  # the updated config and auth files to disk.
  def self.add_config_ui name, options
    CLIHelper.new options
    config_json = JSON.load File.read(options[:config_file])
    auth_json = JSON.load File.read(options[:auth_file])
    say("Configure #{name}")
    name = check_name name, config_json
    conf = {}
    conf["entry_url"] = ask_entry_url
    conf["type"] = ask_type conf
    conf["pdfs"] = ask_pdfs conf
    # BUGFIX: the needs_auth flag lives on the entry for the chosen service
    # type; get_services["needs_auth"] always returned nil before, so the
    # auth question was never asked.
    if BaseService.get_services[conf["type"]]["needs_auth"]
      conf["auth"] = ask_auth name, conf, auth_json
    end
    config_json[name] = conf
    say "This configuration is placed into your config files."
    say "Your config file is #{options[:config_file]}."
    say "Your authentication config file is #{options[:auth_file]}."
    File.open(options[:config_file], "w") do |f|
      f.puts JSON.pretty_generate config_json
    end
    File.open(options[:auth_file], "w") do |f|
      f.puts JSON.pretty_generate auth_json
    end
  end

  # Re-prompts until the job name is not already taken.
  def self.check_name name, config_json
    names = config_json.keys
    while names.include? name
      name = ask "Fetch job name (#{name} is already in use)? "
    end
    return name
  end

  # Asks for the entry point url of the new job.
  def self.ask_entry_url
    return ask_url "Entry point url? "
  end

  # Lets the user choose the service type; the default is guessed from the
  # entry url.
  def self.ask_type conf
    default = BaseService::get_service_for_url conf["entry_url"]
    choose do |menu|
      menu.prompt = "Service type [#{default}]? "
      menu.default = default
      BaseService::get_services.each do |name, service|
        menu.choices("#{name} (#{service["description"]})") do |q|
          # FIX: corrected "choosen" typo in the user-facing message.
          say "You've chosen '#{name}'."
          return name
        end
      end
    end
  end

  # Collects the pdf source/destination settings for the job.
  def self.ask_pdfs conf
    return {
      "src_folder" => ask_non_empty("Source folder url (relative to entry url directory if starts with dot)? "),
      "dest_folder" => ask_non_empty("Destination folder (relative to $HOME)? "),
      # FIX: corrected "Dowload" typo in the prompt.
      "download_once" => ask_yes_no(
        "Download a PDF only once (ignore changes, boost performance)? ", "yes"
      ) == "yes"
    }
  end

  # Lets the user pick an existing auth configuration or create a new one;
  # returns the name of the auth entry and stores new credentials in
  # auth_json.
  def self.ask_auth name, conf, auth_json
    is_studium_kit = conf["type"] == "studium_kit"
    has_s_kit_auth = auth_json["studium_kit"] != nil
    has_name_auth = auth_json[name] != nil
    default = ""
    if ask_yes_no("Auth: Use existing user/password configuration? ", is_studium_kit && has_s_kit_auth ? "yes" : "no") == "yes"
      if is_studium_kit && has_s_kit_auth
        default = "studium_kit"
      end
      choose do |menu|
        menu.prompt = "Auth: Which configuration? "
        menu.default = default unless default.empty?
        auth_json.each do |name, config|
          menu.choices("#{name} (user: #{config["user"]})") do |q|
            say "Auth: You've chosen '#{name}'."
            return name
          end
        end
      end
    else
      auths = auth_json.keys
      auth_name = ""
      if is_studium_kit && !has_s_kit_auth
        default = "studium_kit"
        auth_name = ask "Auth: Configuration name [studium_kit]? " do |q|
          # BUGFIX: was the bare identifier `studium_kit` (NameError at
          # runtime); the intended default is the string "studium_kit".
          q.default = "studium_kit"
        end
      elsif not has_name_auth
        auth_name = ask "Auth: Configuration name [#{name}]? " do |q|
          q.default = name
        end
      else
        auth_name = ask_non_empty "Auth: Configuration name? "
      end
      begin
        auth_json[auth_name] = ask_user_pass
      # FIX: corrected "you're credentials" typo in the prompt.
      end while ask_yes_no("Auth: Confirm that your credentials are right. Are they? ", "yes") == "no"
      return auth_name
    end
  end

  # Prompts for user name and password; the password is typed twice and
  # masked with "x".
  def self.ask_user_pass
    user = ask "Auth: User name? "
    pass = ""
    pass2 = ""
    begin
      pass = ask("Auth: Password? ") { |q| q.echo = "x" }
      pass2 = ask("Auth: Retype it ") { |q| q.echo = "x" }
    end while pass != pass2
    return {
      "user" => user,
      "pass" => pass
    }
  end

  # Asks until the answer looks like a plausible url (at least 4 chars
  # after stripping).
  def self.ask_url question
    str = ""
    while str.strip.length < 4
      str = ask(question) || ""
    end
    return str.strip
  end

  # Asks until the answer is non-empty after stripping whitespace.
  def self.ask_non_empty question
    str = ""
    while str.strip.empty?
      str = ask(question) || ""
    end
    return str.strip
  end

  # One-line yes/no menu; returns "yes" or "no" as a string.
  def self.ask_yes_no question, default = "yes"
    choose do |menu|
      menu.layout = :one_line
      menu.prompt = "#{question} [#{default}] "
      menu.default = default
      menu.choices(:yes, :no) do |q|
        return q.to_s
      end
    end
  end
end
|
data/lib/kitcrawler.rb
ADDED
data/lib/services.rb
ADDED
@@ -0,0 +1,381 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'cgi'
|
3
|
+
require 'pp'
|
4
|
+
require 'uri'
|
5
|
+
require 'logger'
|
6
|
+
require 'set'
|
7
|
+
require 'time'
|
8
|
+
require 'uri'
|
9
|
+
require 'json'
|
10
|
+
|
11
|
+
# Base crawler: fetches an entry page with curl, recursively follows html
# links that stay below the configured source folder and downloads every
# pdf found there. Subclasses register themselves via add_service_class
# and override authenticate for services that need a login.
class BaseService

  @conf = {}
  @auth_app = ""
  @file_header_cache = {} #url => splitted HTTP header lines
  @processed_pdfs = {} #url => dest file
  @name = ""
  # BUGFIX: the bare `@@log` expression was removed; reading an
  # uninitialized class variable raises a NameError when the class is
  # loaded, and the variable was never used (logging happens through the
  # per-instance @log).
  @@service_classes = {}
  @type_cache = {}
  @uri_cache = {} #url => URI

  # @param name [String] job name (used as log progname)
  # @param conf [Hash] job configuration, merged over the defaults below
  # @param auth_conf [Hash] global auth configurations (name => user/pass)
  # @param log_level Logger severity for this job
  # @param url_type_cache [Hash] shared url => type ("pdf"/"html"/"") cache
  def initialize name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}
    @base_dir = `echo ~`.strip
    @uri_cache = {}
    @file_header_cache = {}
    @type_cache = url_type_cache
    @processed_pdfs = {}
    @name = name
    @log = Logger.new(STDOUT)
    @log.progname = name
    @log.level = log_level

    @conf = {
      "type" => "base",
      "exclude_file_endings" => [".css", ".js", ".txt", ".rss", ".atom"],
      "access_pause" => { #in seconds
        "min" => 0.1,
        "max" => 0.3
      },
      "pdfs" => {
        "src_folder" => "abc.de/a", #is relative to entry_url base dir if starts with dot
        "dest_folder" => "abcd",
        "download_once" => true
      },
      "cookie_jar" => "cookies.txt",
      "user_agent" => "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0",
      "entry_url" => "",
      "auth" => "base" #references auth conf or {"user" => "", "pass" => ""}
    }
    temp_conf = @conf.merge conf
    # Merge the nested pdfs hash separately so partial pdfs configs keep
    # the remaining defaults.
    unless conf["pdfs"] == nil
      temp_conf["pdfs"] = @conf["pdfs"].merge conf["pdfs"]
    end
    @conf = temp_conf
    # A string auth value references an entry in the global auth config.
    if @conf["auth"].is_a? String
      @conf["auth"] = auth_conf[@conf["auth"]]
      @log.debug "Load auth from auth config #{auth_conf}"
    end

    # A source folder starting with "." is resolved relative to the
    # directory of the entry url.
    if @conf["pdfs"]["src_folder"].start_with? "."
      entry_uri = get_uri @conf["entry_url"]
      entry_path_url = entry_uri.scheme + "://" + entry_uri.host + File.dirname(entry_uri.path)
      @conf["pdfs"]["src_folder"] = "#{entry_path_url}/#{@conf["pdfs"]["src_folder"]}"
      @log.info "Source folder is #{@conf["pdfs"]["src_folder"]}"
    end

    src_url_parsed = URI.parse(@conf["pdfs"]["src_folder"])
    @conf["pdfs"]["src_path"] = src_url_parsed.path
    @conf["pdfs"]["src_host"] = src_url_parsed.host
    @log.info "Start authentication"
    authenticate
    @log.info "Authentication completed"
  end

  # Instantiates the registered service class for conf["type"].
  # @raise [RuntimeError] when the type is not registered
  def self.get_service name, conf, auth_conf={}, log_level = Logger::WARN, url_type_cache = {}
    service = @@service_classes[conf["type"]]
    if service == nil
      raise "Unknown service #{conf["type"]}"
    else
      service["class"].new name, conf, auth_conf, log_level, url_type_cache
    end
  end

  # No-op; subclasses override this to perform a login.
  def authenticate
    ""
  end

  # Entry point: crawls the entry url recursively.
  def execute
    @log.info "Start grawling #{@conf["entry_url"]}"
    parse_html_page @conf["entry_url"]
    @log.info "Completed grawling #{@conf["entry_url"]}"
  end

  # Fetches url, follows all contained html links recursively (cycle-safe
  # via url_cache) and downloads all contained pdf links.
  def parse_html_page url, url_cache = Set.new
    url = url_chomp url
    return if url_cache.member?(url)
    url_cache.add url
    @log.info "Fetch and parse #{url}"
    html = ""
    begin
      html = fetch_url url
      access_pause_sleep
    rescue => ex
      @log.error "Cannot fetch #{url}"
      @log.error ex
      return
    end
    links = parse_html url, html
    links["html"].each do |html_link|
      parse_html_page html_link, url_cache
    end
    links["pdf"].each do |pdf_link|
      process_pdf pdf_link
    end
  end

  ##
  #Executes curl to fetch the requested url
  #@param url requested url
  #@param output_file output destination, if nil the output gets returned by
  #this method
  def fetch_url url, output_file=nil, curl_params=""
    curl_params = "#{@auth_app} #{curl_params} --silent --user-agent \"#{@conf["user_agent"]}\""
    # BUGFIX: the url is now quoted so shell metacharacters in query
    # strings (& ? ;) cannot break or extend the curl command line.
    curl_params += " -b #{@conf["cookie_jar"]} -c #{@conf["cookie_jar"]} -L -o \"#{output_file || "-"}\" \"#{url}\""
    @log.debug "Call curl on #{url}"
    @log.debug "Curl parameters '#{curl_params}'"
    `cd #{@base_dir}; curl #{curl_params}`
  end

  # Sends a POST request to url with the given form params via curl.
  # Returns "" and logs when the request fails.
  def post url, params, output_file=nil, curl_params=""
    param_arr = []
    params.each do |key, value|
      param_arr << "#{CGI::escape(key)}=#{CGI::escape(value)}"
    end
    param = param_arr.join "&"
    begin
      fetch_url url, output_file, "#{curl_params} --data \"#{param}\""
    rescue => ex
      @log.error "Failed to POST #{url} with data #{params}"
      @log.error ex
      ""
    end
  end

  # Extracts all <a href> links from html and classifies them as pdf or
  # html (relative hrefs are resolved against url). Unparsable links are
  # skipped.
  # @return [Hash] {'pdf' => [...], 'html' => [...]}
  def parse_html url, html
    doc = nil
    links = {'pdf' => [], 'html' => []}
    begin
      doc = Nokogiri::HTML html
    rescue => ex
      @log.error "Parsing html from url #{url} failed"
      return links
    end
    doc.css('a[href]').each do |link|
      begin
        link_url = url_chomp(URI.join(url, link.attributes["href"]).to_s).to_s
        @log.debug "Process link #{link_url}"
        if is_pdf_url link_url
          links['pdf'] << link_url
          @log.debug "#{link_url} is pdf"
        elsif is_html_url link_url
          links['html'] << link_url
          @log.debug "#{link_url} is html"
        end
      rescue => ex
        @log.debug "Omit #{link}"
      end
    end
    return links
  end

  # Returns the value attribute of the element with the given id in html,
  # or "" when parsing fails or the field is absent.
  def get_field_value html, field
    doc = nil
    begin
      doc = Nokogiri::HTML html
    rescue => ex
      @log.error "Parsing html failed"
      @log.error ex
      return ""
    end
    value = ""
    doc.css("##{field}").each do |link|
      value = link.attributes["value"].to_s
    end
    return value
  end

  # Determines the content type of url: "pdf", "html" or "" (excluded or
  # other). Results are memoized in the shared @type_cache; a HEAD request
  # is only made when the extension is not conclusive.
  def get_type url
    if is_excluded url
      return ""
    end
    if @type_cache[url] == nil
      # `|| ""` guards against responses without a Content-Type header,
      # which previously raised a NoMethodError on nil.
      if url.upcase.end_with?(".PDF") ||
          (get_file_header(url)["Content-Type"] || "").start_with?("application/pdf", "application/x-pdf")
        @type_cache[url] = "pdf"
      elsif (get_file_header(url)["Content-Type"] || "").start_with?("text/html")
        @type_cache[url] = "html"
      else
        @type_cache[url] = ""
      end
    end
    return @type_cache[url]
  end

  def is_pdf_url url
    get_type(url) == "pdf"
  end

  def is_html_url url
    get_type(url) == "html"
  end

  # A url is excluded when its path has an excluded file ending, lies on a
  # different host than the source folder, or is outside the source path.
  # BUGFIX: the old code passed the whole Array to start_with? via send
  # (a TypeError at runtime) and used start_with? although the
  # configuration lists file *endings*; the list is now splatted into
  # end_with?.
  def is_excluded url
    parsed_url = get_uri url
    parsed_url.path.end_with?(*@conf["exclude_file_endings"]) ||
      parsed_url.host != @conf["pdfs"]["src_host"] ||
      !parsed_url.path.start_with?(@conf["pdfs"]["src_path"])
  end

  # Sleeps a random duration between the configured min and max pause to
  # behave a bit more human.
  def access_pause_sleep
    min = @conf["access_pause"]["min"]
    max = @conf["access_pause"]["max"]
    duration = Random.rand() * (max - min) + min
    @log.debug "Sleep #{duration} seconds to behave a bit more human"
    sleep duration
  end

  # Fetches (and memoizes) the HTTP response headers of url via a HEAD
  # request; returns a header-name => value hash.
  def get_file_header url
    url = url_chomp url
    if @file_header_cache[url] == nil
      header = fetch_url url, "-", "-I"
      lines = header.split("\r\n").map {|val| val.split(": ") }
      response = {}
      lines.each {|arr| response[arr[0]] = arr[1] }
      @file_header_cache[url] = response
      @log.info "Fetch header of #{url}"
      access_pause_sleep
    end
    return @file_header_cache[url]
  end

  # Path plus query string of url (scheme and host stripped).
  # BUGFIX: URI#query is nil (not "") when absent, so the old `!= ""`
  # check appended a stray "?" to every query-less url.
  def get_path_url url
    parsed = get_uri url
    parsed.path + (parsed.query != nil ? "?#{parsed.query}" : "")
  end

  # Downloads the pdf at url into the destination folder unless it was
  # already processed in this run, or (depending on download_once /
  # Last-Modified) the destination file is up to date.
  def process_pdf url
    url = url_chomp url
    return unless @processed_pdfs[url] == nil
    @log.info "Process pdf #{url}"
    dest = get_dest_path url
    if not @conf["pdfs"]["download_once"]
      header_date = get_file_header(url)["Last-Modified"]
      header_time = header_date != nil ? Time.parse(header_date).to_i : Time.now.to_i
      # File.exist? replaces the deprecated File.exists? (removed in Ruby 3.2).
      file_time = File.exist?(dest) ? File.mtime(dest).to_i : 0
      @log.info "Process pdf #{url} with mtime #{header_time}, file mtime #{file_time}"
      if file_time >= header_time
        @log.info "Destination file #{dest} isn't younger => no download"
        return
      end
    elsif File.exist? dest
      @log.info "Destination file exists => no download"
      return
    end
    `mkdir -p "#{File.dirname(dest)}"` unless File.exist? File.dirname(dest)
    @log.info "Destination file #{dest} is older => download"
    begin
      @log.debug(fetch_url url, dest)
    rescue => ex
      @log.error "Downloading #{url} failed"
      @log.error ex
    end
    @processed_pdfs[url] = dest
    access_pause_sleep
  end

  # Maps a source url to its destination path: dest_folder plus the part
  # of the url path below src_path.
  def get_dest_path url
    url_path = get_uri(url).path
    src_path = @conf["pdfs"]["src_path"]
    dest_folder = @conf["pdfs"]["dest_folder"]
    dest_folder + "/" + url_path.slice(src_path.length, url_path.length - src_path.length)
  end

  # Registers a service class under name so get_service / ask_type can
  # find it.
  def self.add_service_class name, description, service_class, needs_auth = true, url_regex = nil
    @@service_classes[name] = {
      "class" => service_class,
      "url_regex" => url_regex,
      "description" => description,
      "needs_auth" => needs_auth
    }
  end

  # Parses (and memoizes) url into a URI object.
  def get_uri url
    if @uri_cache[url] == nil
      @uri_cache[url] = URI.parse url
    end
    return @uri_cache[url]
  end

  # Normalizes url to scheme://host/path[?query], dropping fragments and
  # userinfo.
  def url_chomp url
    uri = get_uri url
    uri.scheme + "://" + uri.host + uri.path + (uri.query != nil ? "?#{uri.query}" : "")
  end

  # Returns a copy of the registered service classes.
  def self.get_services
    @@service_classes.clone
  end

  # Guesses the service type for url by matching the registered url
  # regexes; falls back to "base".
  # BUGFIX: the old `unless regex == nil && regex =~ url` condition was
  # effectively always true, so the first registered service was returned
  # regardless of the url.
  def self.get_service_for_url url
    @@service_classes.each do |name, service|
      if service["url_regex"] != nil && service["url_regex"] =~ url
        return name
      end
    end
    return "base"
  end

  self.add_service_class "base", "without any authentication", self, false, nil

end
|
322
|
+
|
323
|
+
# Base class for services that require user/password credentials.
class SecuredService < BaseService

  # Validates that credentials are present, then delegates to the
  # subclass hook _authenticate.
  # @raise [RuntimeError] when the auth config, user or password is missing
  def authenticate
    auth = @conf["auth"]
    credentials_missing = auth == nil || auth["user"] == nil || !auth["pass"]
    raise "No authentication (user name and password) given" if credentials_missing
    _authenticate
  end

  # Hook for subclasses; the default implementation does nothing.
  def _authenticate
  end

end
|
336
|
+
|
337
|
+
# Crawler for the studium.kit.edu SharePoint site: performs the ASP.NET
# form login before crawling.
class StudiumKITService < SecuredService

  # Logs in by fetching the login page, extracting the ASP.NET hidden
  # fields (__VIEWSTATE, __EVENTVALIDATION) and POSTing the credentials
  # back to the same url.
  # @raise [RuntimeError] when fetching, posting or the login itself fails
  def _authenticate
    login_url = "https://studium.kit.edu/_layouts/login.aspx?ReturnUrl=%2f"
    html = nil
    post_html = nil
    begin
      html = fetch_url login_url
      # Form field names match the SharePoint login form; "Anmelden" is
      # the German submit-button label the server expects.
      params = {
        "ctl00$PlaceHolderMain$Login$UserName" => @conf["auth"]["user"],
        "ctl00$PlaceHolderMain$Login$password" => @conf["auth"]["pass"],
        "ctl00$PlaceHolderMain$Login$loginbutton" => "Anmelden",
        "__VIEWSTATE" => get_field_value(html, "__VIEWSTATE"),
        "__EVENTVALIDATION" => get_field_value(html, "__EVENTVALIDATION"),
        "__spDummyText1" => "",
        "__spDummyText2" => ""
      }
    rescue => ex
      @log.fatal ex
      raise "Fetching and parsing login page failed"
    end
    begin
      post_html = post login_url, params
    rescue => ex
      @log.fatal ex
      raise "POST request to login page failed"
    end
    # Heuristic: a failed login returns the unchanged login page.
    if post_html == html
      raise "Authentication failed, wrong user name or password"
    end
  end

  self.add_service_class "studium_kit", "studium.kit.edu service", self, true, /studium\.kit\.edu/

end
|
372
|
+
|
373
|
+
# Crawler for sites protected by HTTP basic authentication; the
# credentials are handed to curl via its -u option.
class HTTPAuthService < SecuredService

  # Builds the curl "-u user:pass" option from the configured credentials;
  # @auth_app is prepended to every curl invocation by fetch_url.
  # NOTE(review): URI::escape is deprecated since Ruby 2.7 and removed in
  # 3.0 — confirm the targeted Ruby version or migrate this call.
  def _authenticate
    @auth_app = "-u #{URI::escape @conf["auth"]["user"]}:#{URI::escape @conf["auth"]["pass"]}"
  end

  self.add_service_class "http_auth", "http authenticated service", self, true, nil

end
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: kitcrawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Johannes Bechberger
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-07-12 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.6.1
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.6.1
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: highline
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 1.6.0
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 1.6.0
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: thor
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 0.19.0
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.19.0
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: damerau-levenshtein
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 1.0.0
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.0.0
|
78
|
+
description: ! "\tCrawl lecture websites and fetch the PDFs automatically. \n\tIt
|
79
|
+
currently supports the studium.kit.edu and other HTTP password protected sites.\n"
|
80
|
+
email: me@mostlynerdless.de
|
81
|
+
executables:
|
82
|
+
- kitcrawler
|
83
|
+
extensions: []
|
84
|
+
extra_rdoc_files:
|
85
|
+
- README.md
|
86
|
+
files:
|
87
|
+
- lib/kitcrawler.rb
|
88
|
+
- lib/services.rb
|
89
|
+
- lib/cli.rb
|
90
|
+
- lib/cli_add.rb
|
91
|
+
- README.md
|
92
|
+
- bin/kitcrawler
|
93
|
+
homepage: https://github.com/parttimenerd/KITCrawler
|
94
|
+
licenses:
|
95
|
+
- GPL v3
|
96
|
+
post_install_message:
|
97
|
+
rdoc_options: []
|
98
|
+
require_paths:
|
99
|
+
- lib
|
100
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
102
|
+
requirements:
|
103
|
+
- - ! '>='
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: 1.8.6
|
106
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
107
|
+
none: false
|
108
|
+
requirements:
|
109
|
+
- - ! '>='
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0'
|
112
|
+
requirements:
|
113
|
+
- Linux (other UNIXes might also work)
|
114
|
+
- curl
|
115
|
+
rubyforge_project:
|
116
|
+
rubygems_version: 1.8.23
|
117
|
+
signing_key:
|
118
|
+
specification_version: 3
|
119
|
+
summary: Fetch lecture PDFs with ease
|
120
|
+
test_files: []
|