brandeins-dl 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +8 -0
- data/Rakefile +6 -1
- data/lib/brandeins-dl/version.rb +1 -1
- data/lib/brandeins-dl.rb +72 -58
- metadata +6 -12
data/Gemfile
CHANGED
data/Rakefile
CHANGED
data/lib/brandeins-dl/version.rb
CHANGED
data/lib/brandeins-dl.rb
CHANGED
@@ -1,9 +1,13 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
1
|
+
%w(
|
2
|
+
brandeins-dl/version
|
3
|
+
nokogiri
|
4
|
+
open-uri
|
5
|
+
uri
|
6
|
+
fileutils
|
7
|
+
thor
|
8
|
+
).each do |lib|
|
9
|
+
require lib
|
10
|
+
end
|
7
11
|
|
8
12
|
module BrandEins
|
9
13
|
|
@@ -28,94 +32,99 @@ module BrandEins
|
|
28
32
|
|
29
33
|
class Downloader
|
30
34
|
attr_reader :archive
|
31
|
-
|
35
|
+
|
32
36
|
def initialize(path)
|
33
|
-
@url =
|
34
|
-
@archive =
|
37
|
+
@url = 'http://www.brandeins.de'
|
38
|
+
@archive = false
|
35
39
|
@dl_dir = path
|
36
|
-
@tmp_dir = path +
|
37
|
-
|
38
|
-
check_download_path
|
40
|
+
@tmp_dir = path + '/tmp'
|
41
|
+
create_tmp_dirs
|
39
42
|
end
|
40
43
|
|
41
|
-
def
|
42
|
-
|
44
|
+
def setup
|
45
|
+
@archive = ArchiveSite.new @url
|
43
46
|
end
|
44
|
-
|
47
|
+
|
45
48
|
def get_magazines_of_year(year = 2000)
|
49
|
+
setup
|
46
50
|
puts "Getting all brand eins magazines of a #{year}. This could take a while..."
|
47
|
-
magazine_links_per_year = @archive.
|
51
|
+
magazine_links_per_year = @archive.get_magazine_links_by_year(year)
|
48
52
|
magazine_links_per_year.each_with_index do |magazine_link, volume|
|
49
53
|
puts "Parsing Volume #{volume} of #{year}"
|
50
54
|
target_pdf = get_target_pdf(year, volume)
|
51
55
|
get_magazine_by_link(magazine_link, target_pdf)
|
52
56
|
end
|
53
57
|
end
|
54
|
-
|
58
|
+
|
55
59
|
def get_magazine(year = 2000, volume = 1)
|
60
|
+
setup
|
56
61
|
puts "Parsing Volume #{volume} of #{year}"
|
57
62
|
target_pdf = get_target_pdf(year, volume)
|
58
|
-
|
59
|
-
magazine_links = @archive.
|
63
|
+
|
64
|
+
magazine_links = @archive.get_magazine_links_by_year(year)
|
60
65
|
target_magazine_link = magazine_links[volume-1]
|
61
|
-
|
66
|
+
|
62
67
|
get_magazine_by_link(target_magazine_link, target_pdf)
|
63
68
|
end
|
64
|
-
|
69
|
+
|
70
|
+
private
|
71
|
+
def create_tmp_dirs
|
72
|
+
FileUtils.mkdir_p @tmp_dir unless File.directory?(@tmp_dir)
|
73
|
+
end
|
74
|
+
|
65
75
|
def get_magazine_by_link(target_magazine_link, target_pdf)
|
66
76
|
pdf_links = @archive.magazine_pdf_links(target_magazine_link)
|
67
77
|
process_pdf_links(pdf_links, target_pdf)
|
68
78
|
cleanup
|
69
79
|
end
|
70
|
-
|
71
|
-
|
80
|
+
|
72
81
|
def get_target_pdf(year, volume)
|
73
82
|
"Brand-Eins-#{year}-#{volume}.pdf"
|
74
83
|
end
|
75
|
-
|
84
|
+
|
76
85
|
def process_pdf_links(pdf_links, target_pdf)
|
77
86
|
pdf_downloader = PDFDownloader.new(pdf_links, @tmp_dir)
|
78
87
|
pdf_files = pdf_downloader.download_all
|
79
88
|
merge_pdfs(pdf_files, target_pdf)
|
80
89
|
end
|
81
|
-
|
90
|
+
|
82
91
|
def merge_pdfs(pdf_files, target_pdf)
|
83
92
|
puts "Merging single PDFs now"
|
84
93
|
pdf_sources = pdf_files.join(" ")
|
85
94
|
system "pdftk #{pdf_sources} output #{@dl_dir}/#{target_pdf}"
|
86
95
|
end
|
87
|
-
|
96
|
+
|
88
97
|
def cleanup
|
89
98
|
FileUtils.rm_r @tmp_dir
|
90
99
|
end
|
91
|
-
|
100
|
+
|
92
101
|
class PDFDownloader
|
93
|
-
|
102
|
+
|
94
103
|
def initialize(pdf_links, dl_dir)
|
95
104
|
@dl_dir = dl_dir
|
96
105
|
@pdf_links = pdf_links
|
97
106
|
end
|
98
|
-
|
107
|
+
|
99
108
|
def download_all
|
100
109
|
pdf_files = Array.new
|
101
110
|
@pdf_links.each do |pdf_link|
|
102
111
|
pdf_name = @dl_dir + '/' + File.basename(pdf_link)
|
103
112
|
pdf_url = pdf_link
|
104
113
|
download_pdf(pdf_url, pdf_name)
|
105
|
-
|
114
|
+
|
106
115
|
pdf_files << pdf_name
|
107
116
|
end
|
108
117
|
pdf_files
|
109
118
|
end
|
110
|
-
|
111
|
-
private
|
112
|
-
|
119
|
+
|
120
|
+
private
|
121
|
+
|
113
122
|
def download_pdf(pdf_url, filename)
|
114
123
|
puts "Downloading PDF from #{pdf_url} to #{filename}"
|
115
124
|
File.open(filename,'w') do |f|
|
116
125
|
uri = URI.parse(pdf_url)
|
117
|
-
Net::HTTP.start(uri.host,uri.port) do |http|
|
118
|
-
http.request_get(uri.path) do |res|
|
126
|
+
Net::HTTP.start(uri.host,uri.port) do |http|
|
127
|
+
http.request_get(uri.path) do |res|
|
119
128
|
res.read_body do |seg|
|
120
129
|
f << seg
|
121
130
|
#hack -- adjust to suit:
|
@@ -125,20 +134,26 @@ module BrandEins
|
|
125
134
|
end
|
126
135
|
end
|
127
136
|
end
|
128
|
-
|
137
|
+
|
129
138
|
end
|
130
|
-
|
139
|
+
|
131
140
|
class ArchiveSite
|
132
|
-
|
133
141
|
attr_accessor :doc
|
134
|
-
|
135
|
-
def initialize
|
136
|
-
@base_url =
|
142
|
+
|
143
|
+
def initialize(base_url, html = false)
|
144
|
+
@base_url = base_url
|
137
145
|
@archive_url = @base_url + "/archiv.html"
|
146
|
+
if html
|
147
|
+
@doc = Nokogiri::HTML(html)
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
151
|
+
def setup
|
152
|
+
return if defined?(@doc) != nil
|
138
153
|
@doc = Nokogiri::HTML(open(@archive_url))
|
139
154
|
end
|
140
|
-
|
141
|
-
def
|
155
|
+
|
156
|
+
def get_magazine_links_by_year(year = 2000)
|
142
157
|
puts "Loading Magazine from year #{year}"
|
143
158
|
magazine_nodes_with_meta = @doc.css(".jahrgang-#{year} ul li")
|
144
159
|
magazine_links = Array.new
|
@@ -154,35 +169,35 @@ module BrandEins
|
|
154
169
|
end
|
155
170
|
magazine_links
|
156
171
|
end
|
157
|
-
|
172
|
+
|
158
173
|
def magazine_pdf_links(url)
|
159
|
-
magazine = ArchiveMagazine.new(url)
|
174
|
+
magazine = ArchiveMagazine.new(url, @base_url)
|
160
175
|
magazine.get_magazine_pdf_links
|
161
176
|
end
|
162
|
-
|
177
|
+
|
163
178
|
class ArchiveMagazine
|
164
179
|
attr_accessor :url, :doc
|
165
|
-
|
166
|
-
def initialize(url)
|
180
|
+
|
181
|
+
def initialize(url, base_url, html = false)
|
167
182
|
puts "Parsing #{url}"
|
168
183
|
@url = url
|
169
|
-
@base_url =
|
184
|
+
@base_url = base_url
|
170
185
|
@doc = Nokogiri::HTML(open(url))
|
171
186
|
end
|
172
|
-
|
187
|
+
|
173
188
|
def get_magazine_pdf_links
|
174
189
|
[get_editorial_article_links, get_schwerpunkt_article_links].flatten
|
175
|
-
|
190
|
+
|
176
191
|
end
|
177
|
-
|
192
|
+
|
178
193
|
def get_schwerpunkt_article_links
|
179
194
|
get_links("div.articleList ul h4 a")
|
180
195
|
end
|
181
|
-
|
196
|
+
|
182
197
|
def get_editorial_article_links
|
183
198
|
get_links(".editorial-links li a")
|
184
199
|
end
|
185
|
-
|
200
|
+
|
186
201
|
def get_links(css_selector)
|
187
202
|
pdf_links = Array.new
|
188
203
|
link_nodes = @doc.css(css_selector)
|
@@ -200,16 +215,16 @@ module BrandEins
|
|
200
215
|
end
|
201
216
|
pdf_links
|
202
217
|
end
|
203
|
-
|
218
|
+
|
204
219
|
class MagazineArticle
|
205
220
|
attr_accessor :url, :doc
|
206
|
-
|
221
|
+
|
207
222
|
def initialize(url)
|
208
223
|
puts "Parsing Article: #{url}"
|
209
224
|
@url = url
|
210
225
|
@doc = Nokogiri::HTML(open(url))
|
211
226
|
end
|
212
|
-
|
227
|
+
|
213
228
|
def get_pdf_link
|
214
229
|
link = @doc.css("div#sidebar ul li#downloaden a")
|
215
230
|
if link[0].nil? then
|
@@ -222,7 +237,6 @@ module BrandEins
|
|
222
237
|
end
|
223
238
|
|
224
239
|
end
|
225
|
-
|
226
240
|
end
|
227
241
|
|
228
242
|
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: brandeins-dl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.
|
5
|
+
version: 0.0.4
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Gregory Igelmund
|
@@ -10,10 +10,11 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2012-
|
13
|
+
date: 2012-10-06 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rake
|
17
|
+
prerelease: false
|
17
18
|
requirement: &id001 !ruby/object:Gem::Requirement
|
18
19
|
none: false
|
19
20
|
requirements:
|
@@ -21,10 +22,10 @@ dependencies:
|
|
21
22
|
- !ruby/object:Gem::Version
|
22
23
|
version: "0"
|
23
24
|
type: :runtime
|
24
|
-
prerelease: false
|
25
25
|
version_requirements: *id001
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: thor
|
28
|
+
prerelease: false
|
28
29
|
requirement: &id002 !ruby/object:Gem::Requirement
|
29
30
|
none: false
|
30
31
|
requirements:
|
@@ -32,10 +33,10 @@ dependencies:
|
|
32
33
|
- !ruby/object:Gem::Version
|
33
34
|
version: "0"
|
34
35
|
type: :runtime
|
35
|
-
prerelease: false
|
36
36
|
version_requirements: *id002
|
37
37
|
- !ruby/object:Gem::Dependency
|
38
38
|
name: nokogiri
|
39
|
+
prerelease: false
|
39
40
|
requirement: &id003 !ruby/object:Gem::Requirement
|
40
41
|
none: false
|
41
42
|
requirements:
|
@@ -43,7 +44,6 @@ dependencies:
|
|
43
44
|
- !ruby/object:Gem::Version
|
44
45
|
version: "0"
|
45
46
|
type: :runtime
|
46
|
-
prerelease: false
|
47
47
|
version_requirements: *id003
|
48
48
|
description: "BrandEins Downloader offers two commands: 'brandeins download YEAR' and 'brandeins download YEAR --volume=NUMBER'"
|
49
49
|
email:
|
@@ -77,23 +77,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
77
77
|
requirements:
|
78
78
|
- - ">="
|
79
79
|
- !ruby/object:Gem::Version
|
80
|
-
hash: -1677672259199041797
|
81
|
-
segments:
|
82
|
-
- 0
|
83
80
|
version: "0"
|
84
81
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
82
|
none: false
|
86
83
|
requirements:
|
87
84
|
- - ">="
|
88
85
|
- !ruby/object:Gem::Version
|
89
|
-
hash: -1677672259199041797
|
90
|
-
segments:
|
91
|
-
- 0
|
92
86
|
version: "0"
|
93
87
|
requirements: []
|
94
88
|
|
95
89
|
rubyforge_project:
|
96
|
-
rubygems_version: 1.8.
|
90
|
+
rubygems_version: 1.8.15
|
97
91
|
signing_key:
|
98
92
|
specification_version: 3
|
99
93
|
summary: BrandEins Downloader allows you to download past volumes of the Brand Eins magazine
|