brandeins 0.1.6 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +0 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +50 -0
- data/README.md +2 -2
- data/Rakefile +18 -1
- data/lib/brandeins.rb +6 -310
- data/lib/brandeins/cli.rb +47 -0
- data/lib/brandeins/downloader.rb +112 -0
- data/lib/brandeins/merger/extensions.rb +9 -0
- data/lib/brandeins/merger/pdf_tools.rb +47 -0
- data/lib/brandeins/merger/templates/base.rb +47 -0
- data/lib/brandeins/merger/templates/ghostscript_windows.rb +25 -0
- data/lib/brandeins/merger/templates/osx.rb +11 -0
- data/lib/brandeins/merger/templates/pdftk_osx.rb +25 -0
- data/lib/brandeins/merger/templates/windows.rb +11 -0
- data/lib/brandeins/parser/archive_site.rb +54 -0
- data/lib/brandeins/parser/article_site.rb +26 -0
- data/lib/brandeins/parser/magazine_site.rb +49 -0
- data/lib/brandeins/setup.rb +4 -6
- data/lib/brandeins/version.rb +1 -1
- data/specs/brandeins_spec.rb +39 -0
- data/specs/spec_helper.rb +1 -0
- data/test/brandeins_test.rb +3 -4
- data/test/helper.rb +1 -19
- data/test_support/capture_stdout.rb +12 -0
- data/test_support/fixtures/brandeins_archiv.html +50 -0
- data/test_support/fixtures/cover.jpg +0 -0
- metadata +22 -5
- data/lib/brandeins/pdf-tools.rb +0 -89
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
brandeins (0.2.0)
|
5
|
+
nokogiri
|
6
|
+
prawn
|
7
|
+
rake
|
8
|
+
thor
|
9
|
+
|
10
|
+
GEM
|
11
|
+
remote: http://rubygems.org/
|
12
|
+
specs:
|
13
|
+
Ascii85 (1.0.2)
|
14
|
+
addressable (2.3.2)
|
15
|
+
afm (0.2.0)
|
16
|
+
columnize (0.3.6)
|
17
|
+
crack (0.3.2)
|
18
|
+
debugger (1.3.1)
|
19
|
+
columnize (>= 0.3.1)
|
20
|
+
debugger-linecache (~> 1.1.1)
|
21
|
+
debugger-ruby_core_source (~> 1.1.8)
|
22
|
+
debugger-linecache (1.1.2)
|
23
|
+
debugger-ruby_core_source (>= 1.1.1)
|
24
|
+
debugger-ruby_core_source (1.1.8)
|
25
|
+
hashery (2.1.0)
|
26
|
+
nokogiri (1.5.6)
|
27
|
+
pdf-reader (1.3.0)
|
28
|
+
Ascii85 (~> 1.0.0)
|
29
|
+
afm (~> 0.2.0)
|
30
|
+
hashery (~> 2.0)
|
31
|
+
ruby-rc4
|
32
|
+
ttfunk
|
33
|
+
prawn (0.12.0)
|
34
|
+
pdf-reader (>= 0.9.0)
|
35
|
+
ttfunk (~> 1.0.2)
|
36
|
+
rake (10.0.3)
|
37
|
+
ruby-rc4 (0.1.5)
|
38
|
+
thor (0.17.0)
|
39
|
+
ttfunk (1.0.3)
|
40
|
+
webmock (1.9.0)
|
41
|
+
addressable (>= 2.2.7)
|
42
|
+
crack (>= 0.1.7)
|
43
|
+
|
44
|
+
PLATFORMS
|
45
|
+
ruby
|
46
|
+
|
47
|
+
DEPENDENCIES
|
48
|
+
brandeins!
|
49
|
+
debugger
|
50
|
+
webmock
|
data/README.md
CHANGED
@@ -12,7 +12,7 @@ several ruby libraries (that you can get through rubygems)
|
|
12
12
|
|
13
13
|
|
14
14
|
## Install
|
15
|
-
`gem install brandeins
|
15
|
+
`gem install brandeins`
|
16
16
|
|
17
17
|
|
18
18
|
## Usage
|
@@ -23,4 +23,4 @@ Download just one magazine
|
|
23
23
|
|
24
24
|
Download the whole collection of a certain year
|
25
25
|
|
26
|
-
`brandeins download_all --path=/Path/where/to/download/the/files --year=2011`
|
26
|
+
`brandeins download_all --path=/Path/where/to/download/the/files --year=2011 --all`
|
data/Rakefile
CHANGED
@@ -3,7 +3,8 @@ require 'rake/testtask'
|
|
3
3
|
require './lib/brandeins/version'
|
4
4
|
|
5
5
|
Rake::TestTask.new do |t|
|
6
|
-
t.
|
6
|
+
t.test_files = FileList['test/*_test.rb', 'specs/*_spec.rb']
|
7
|
+
t.verbose = true
|
7
8
|
end
|
8
9
|
|
9
10
|
task :install do
|
@@ -15,4 +16,20 @@ task publish: [ :build ] do
|
|
15
16
|
sh "gem push ./pkg/brandeins-#{BrandEins::VERSION}.gem"
|
16
17
|
end
|
17
18
|
|
19
|
+
rule /^version:bump:(major|minor|patch)/ do |t|
|
20
|
+
sh "git status | grep 'nothing to commit'"
|
21
|
+
index = ['major', 'minor','patch'].index(t.name.split(':').last)
|
22
|
+
file = 'lib/brandeins/version.rb'
|
23
|
+
|
24
|
+
version_file = File.read(file)
|
25
|
+
old_version, *version_parts = version_file.match(/(\d+)\.(\d+)\.(\d+)/).to_a
|
26
|
+
version_parts[index] = version_parts[index].to_i + 1
|
27
|
+
version_parts[2] = 0 if index < 2
|
28
|
+
version_parts[1] = 0 if index < 1
|
29
|
+
new_version = version_parts * '.'
|
30
|
+
File.open(file,'w'){|f| f.write(version_file.sub(old_version, new_version)) }
|
31
|
+
|
32
|
+
sh "git add #{file} Gemfile.lock && git commit -m 'bump version to #{new_version}'"
|
33
|
+
end
|
34
|
+
|
18
35
|
task :default => :test
|
data/lib/brandeins.rb
CHANGED
@@ -1,312 +1,8 @@
|
|
1
|
-
|
2
|
-
brandeins/version
|
3
|
-
brandeins/setup
|
4
|
-
brandeins/pdf-tools
|
5
|
-
nokogiri
|
6
|
-
open-uri
|
7
|
-
uri
|
8
|
-
fileutils
|
9
|
-
thor
|
10
|
-
prawn
|
11
|
-
).each do |lib|
|
12
|
-
begin
|
13
|
-
require lib
|
14
|
-
rescue Exception => e
|
15
|
-
puts "missing #{lib}, #{e.inspect}"
|
16
|
-
end
|
17
|
-
end
|
1
|
+
# encoding: utf-8
|
18
2
|
|
19
|
-
|
3
|
+
require 'brandeins/version'
|
4
|
+
require 'brandeins/downloader'
|
5
|
+
require 'brandeins/setup'
|
6
|
+
require 'brandeins/cli'
|
20
7
|
|
21
|
-
|
22
|
-
map '--version' => :version
|
23
|
-
|
24
|
-
desc '--version', 'Displays current version'
|
25
|
-
def version
|
26
|
-
puts BrandEins::VERSION
|
27
|
-
end
|
28
|
-
|
29
|
-
desc 'download', 'Download past brand eins magazines (use `brandeins help download` to learn more about options)'
|
30
|
-
method_option :path, :type => :string
|
31
|
-
method_option :volume, :type => :numeric
|
32
|
-
method_option :all
|
33
|
-
method_option :year, :type => :numeric
|
34
|
-
def download
|
35
|
-
path = options.path ? File.expand_path(options.path) : Dir.pwd
|
36
|
-
year = options.year || Time.new.year
|
37
|
-
all = options.all
|
38
|
-
volume = options.volume
|
39
|
-
|
40
|
-
if volume.nil? && all.nil?
|
41
|
-
puts "If you want to download a specific volune use the --volume flag or use --all to download all volumes of a year"
|
42
|
-
else
|
43
|
-
downloader = BrandEins::Downloader.new path
|
44
|
-
if !all.nil?
|
45
|
-
downloader.get_magazines_of_year year
|
46
|
-
else
|
47
|
-
downloader.get_magazine year, volume
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
desc 'setup', 'Checks if all requirements for using brandeins gem are met'
|
53
|
-
method_option :help
|
54
|
-
def setup
|
55
|
-
setup = BrandEins::Setup.new
|
56
|
-
if !options.help.nil?
|
57
|
-
setup.help
|
58
|
-
else
|
59
|
-
setup.run
|
60
|
-
end
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
class Downloader
|
65
|
-
attr_reader :archive
|
66
|
-
|
67
|
-
def initialize(path)
|
68
|
-
@url = 'http://www.brandeins.de'
|
69
|
-
@archive = false
|
70
|
-
@dl_dir = path
|
71
|
-
@tmp_dir = path + '/brand-eins-tmp'
|
72
|
-
@pdftool = BrandEins::PdfTools.get_pdf_tool
|
73
|
-
create_tmp_dirs
|
74
|
-
end
|
75
|
-
|
76
|
-
def setup
|
77
|
-
@archive = ArchiveSite.new @url
|
78
|
-
end
|
79
|
-
|
80
|
-
def get_magazines_of_year(year = 2000)
|
81
|
-
setup
|
82
|
-
puts "Getting all brand eins magazines of #{year}. This could take a while..."
|
83
|
-
magazine_links_per_year = @archive.get_magazine_links_by_year(year)
|
84
|
-
magazine_links_per_year.each_with_index do |magazine_link, index|
|
85
|
-
volume = index+1
|
86
|
-
puts "Parsing Volume #{volume} of #{year}"
|
87
|
-
target_pdf = get_target_pdf(year, volume)
|
88
|
-
get_magazine_by_link(magazine_link, target_pdf, year, volume)
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
def get_magazine(year = 2000, volume = 1)
|
93
|
-
setup
|
94
|
-
puts "Parsing Volume #{volume} of #{year}"
|
95
|
-
target_pdf = get_target_pdf(year, volume)
|
96
|
-
|
97
|
-
magazine_links = @archive.get_magazine_links_by_year(year)
|
98
|
-
target_magazine_link = magazine_links[volume-1]
|
99
|
-
|
100
|
-
get_magazine_by_link(target_magazine_link, target_pdf, year, volume)
|
101
|
-
end
|
102
|
-
|
103
|
-
private
|
104
|
-
def create_tmp_dirs
|
105
|
-
FileUtils.mkdir_p @tmp_dir unless File.directory?(@tmp_dir)
|
106
|
-
end
|
107
|
-
|
108
|
-
def get_magazine_by_link(target_magazine_link, target_pdf, year, volume)
|
109
|
-
pdf_links = @archive.magazine_pdf_links(target_magazine_link)
|
110
|
-
pdf_files = download_pdfs(pdf_links)
|
111
|
-
|
112
|
-
pdf_cover = create_cover_pdf(year, volume)
|
113
|
-
pdf_files = pdf_files.reverse.push(pdf_cover).reverse
|
114
|
-
|
115
|
-
if !@pdftool.nil?
|
116
|
-
target_pdf_path = "#{@dl_dir}/#{target_pdf}"
|
117
|
-
@pdftool.merge_pdf_files(pdf_files, target_pdf_path)
|
118
|
-
cleanup
|
119
|
-
else
|
120
|
-
if RUBY_PLATFORM.include? 'darwin'
|
121
|
-
puts 'brandeins wont merge the single pdf files since it didnt find the pdftk tool'
|
122
|
-
end
|
123
|
-
end
|
124
|
-
end
|
125
|
-
|
126
|
-
def create_cover_pdf(year, volume)
|
127
|
-
cover = @archive.get_magazine_cover(year, volume)
|
128
|
-
cover_title = cover[:title]
|
129
|
-
cover_img_url = cover[:img_url]
|
130
|
-
cover_img_file = @tmp_dir + "/cover-#{year}-#{volume}.jpg"
|
131
|
-
cover_pdf_file = @tmp_dir + "/cover-#{year}-#{volume}.pdf"
|
132
|
-
|
133
|
-
File.open(cover_img_file,'w') do |f|
|
134
|
-
uri = URI.parse(cover_img_url)
|
135
|
-
Net::HTTP.start(uri.host,uri.port) do |http|
|
136
|
-
http.request_get(uri.path) do |res|
|
137
|
-
res.read_body do |seg|
|
138
|
-
f << seg
|
139
|
-
#hack -- adjust to suit:
|
140
|
-
sleep 0.005
|
141
|
-
end
|
142
|
-
end
|
143
|
-
end
|
144
|
-
end
|
145
|
-
|
146
|
-
require 'prawn'
|
147
|
-
Prawn::Document.generate(cover_pdf_file) do |pdf|
|
148
|
-
pdf.text "<font size='18'><b>" + cover_title + "</b></font>", :align => :center, :inline_format => true
|
149
|
-
pdf.image cover_img_file, :position => :center, :vposition => :center
|
150
|
-
end
|
151
|
-
return cover_pdf_file
|
152
|
-
end
|
153
|
-
|
154
|
-
def get_target_pdf(year, volume)
|
155
|
-
"Brand-Eins-#{year}-#{volume}.pdf"
|
156
|
-
end
|
157
|
-
|
158
|
-
def download_pdfs(pdf_links)
|
159
|
-
pdf_downloader = PDFDownloader.new(pdf_links, @tmp_dir)
|
160
|
-
pdf_downloader.download_all
|
161
|
-
end
|
162
|
-
|
163
|
-
def cleanup
|
164
|
-
FileUtils.rm_r @tmp_dir
|
165
|
-
end
|
166
|
-
|
167
|
-
class PDFDownloader
|
168
|
-
|
169
|
-
def initialize(pdf_links, dl_dir)
|
170
|
-
@dl_dir = dl_dir
|
171
|
-
@pdf_links = pdf_links
|
172
|
-
end
|
173
|
-
|
174
|
-
def download_all
|
175
|
-
pdf_files = Array.new
|
176
|
-
@pdf_links.each do |pdf_link|
|
177
|
-
pdf_name = @dl_dir + '/' + File.basename(pdf_link)
|
178
|
-
pdf_url = pdf_link
|
179
|
-
download_pdf(pdf_url, pdf_name)
|
180
|
-
pdf_files << pdf_name
|
181
|
-
end
|
182
|
-
pdf_files
|
183
|
-
end
|
184
|
-
|
185
|
-
private
|
186
|
-
|
187
|
-
def download_pdf(pdf_url, filename)
|
188
|
-
if File.exists? filename
|
189
|
-
puts "File #{filename} seems to be already downloaded"
|
190
|
-
return true
|
191
|
-
end
|
192
|
-
|
193
|
-
puts "Downloading PDF from #{pdf_url} to #{filename}"
|
194
|
-
File.open(filename,'wb') do |new_file|
|
195
|
-
open(pdf_url, 'rb') do |read_file|
|
196
|
-
new_file.write(read_file.read)
|
197
|
-
end
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
|
-
end
|
202
|
-
|
203
|
-
class ArchiveSite
|
204
|
-
attr_accessor :doc
|
205
|
-
|
206
|
-
def initialize(base_url, html = false)
|
207
|
-
@base_url = base_url
|
208
|
-
@archive_url = @base_url + "/archiv.html"
|
209
|
-
if html
|
210
|
-
@doc = Nokogiri::HTML(html)
|
211
|
-
end
|
212
|
-
end
|
213
|
-
|
214
|
-
def setup
|
215
|
-
return if defined?(@doc) != nil
|
216
|
-
@doc = Nokogiri::HTML(open(@archive_url))
|
217
|
-
end
|
218
|
-
|
219
|
-
def get_magazine_links_by_year(year = 2000)
|
220
|
-
setup
|
221
|
-
puts "Loading Magazine from year #{year}"
|
222
|
-
magazine_nodes_with_meta = @doc.css(".jahrgang-#{year} ul li")
|
223
|
-
magazine_links = Array.new
|
224
|
-
magazine_nodes_with_meta.each_with_index do |node, index|
|
225
|
-
if node['id'].nil? then
|
226
|
-
link = node.css('a')
|
227
|
-
if link[0].nil? then
|
228
|
-
next
|
229
|
-
end
|
230
|
-
href = link[0]['href']
|
231
|
-
magazine_links << @base_url + '/' + href
|
232
|
-
end
|
233
|
-
end
|
234
|
-
magazine_links
|
235
|
-
end
|
236
|
-
|
237
|
-
def get_magazine_cover(year, volume)
|
238
|
-
title = @doc.css("#month_detail_#{year}_#{volume} .titel").children[0].to_s
|
239
|
-
img_url = ''
|
240
|
-
@doc.css("#month_detail_#{year}_#{volume} .cover a img").each do |node|
|
241
|
-
img_url = node['src']
|
242
|
-
end
|
243
|
-
return { :title => title, :img_url => @base_url + '/' + img_url }
|
244
|
-
end
|
245
|
-
|
246
|
-
def magazine_pdf_links(url)
|
247
|
-
magazine = ArchiveMagazine.new(url, @base_url)
|
248
|
-
magazine.get_magazine_pdf_links
|
249
|
-
end
|
250
|
-
|
251
|
-
class ArchiveMagazine
|
252
|
-
attr_accessor :url, :doc
|
253
|
-
|
254
|
-
def initialize(url, base_url, html = false)
|
255
|
-
puts "Parsing #{url}"
|
256
|
-
@url = url
|
257
|
-
@base_url = base_url
|
258
|
-
@doc = Nokogiri::HTML(open(url))
|
259
|
-
end
|
260
|
-
|
261
|
-
def get_magazine_pdf_links
|
262
|
-
[get_editorial_article_links, get_schwerpunkt_article_links].flatten
|
263
|
-
end
|
264
|
-
|
265
|
-
def get_schwerpunkt_article_links
|
266
|
-
get_links("div.articleList ul h4 a")
|
267
|
-
end
|
268
|
-
|
269
|
-
def get_editorial_article_links
|
270
|
-
get_links(".editorial-links li a")
|
271
|
-
end
|
272
|
-
|
273
|
-
def get_links(css_selector)
|
274
|
-
pdf_links = Array.new
|
275
|
-
link_nodes = @doc.css(css_selector)
|
276
|
-
link_nodes.each do |node|
|
277
|
-
article_link = @base_url + '/' + node['href']
|
278
|
-
article = MagazineArticle.new(article_link)
|
279
|
-
pdf_link = article.get_pdf_link
|
280
|
-
if pdf_link.nil? then
|
281
|
-
puts "------------------------------"
|
282
|
-
puts "No Content for: #{article_link}"
|
283
|
-
puts "------------------------------"
|
284
|
-
else
|
285
|
-
pdf_links << @base_url + '/' + pdf_link
|
286
|
-
end
|
287
|
-
end
|
288
|
-
pdf_links
|
289
|
-
end
|
290
|
-
|
291
|
-
class MagazineArticle
|
292
|
-
attr_accessor :url, :doc
|
293
|
-
|
294
|
-
def initialize(url)
|
295
|
-
puts "Parsing Article: #{url}"
|
296
|
-
@url = url
|
297
|
-
@doc = Nokogiri::HTML(open(url))
|
298
|
-
end
|
299
|
-
|
300
|
-
def get_pdf_link
|
301
|
-
link = @doc.css("div#sidebar ul li#downloaden a")
|
302
|
-
if link[0].nil? then
|
303
|
-
return nil
|
304
|
-
else
|
305
|
-
return link[0]['href']
|
306
|
-
end
|
307
|
-
end
|
308
|
-
end
|
309
|
-
end
|
310
|
-
end
|
311
|
-
end
|
312
|
-
end
|
8
|
+
module BrandEins; end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'thor'
|
3
|
+
|
4
|
+
module BrandEins
|
5
|
+
class CLI < Thor
|
6
|
+
map '--version' => :version
|
7
|
+
|
8
|
+
desc '--version', 'Displays current version'
|
9
|
+
def version
|
10
|
+
puts BrandEins::VERSION
|
11
|
+
end
|
12
|
+
|
13
|
+
desc 'download', 'Download past brand eins magazines (use `brandeins help download` to learn more about options)'
|
14
|
+
method_option :path, :type => :string
|
15
|
+
method_option :volume, :type => :numeric
|
16
|
+
method_option :all
|
17
|
+
method_option :year, :type => :numeric
|
18
|
+
def download
|
19
|
+
path = options.path ? File.expand_path(options.path) : Dir.pwd
|
20
|
+
year = options.year || Time.new.year
|
21
|
+
all = options.all
|
22
|
+
volume = options.volume
|
23
|
+
|
24
|
+
if volume.nil? and all.nil?
|
25
|
+
puts "If you want to download a specific volune use the --volume flag or use --all to download all volumes of a year"
|
26
|
+
else
|
27
|
+
downloader = BrandEins::Downloader.new(path, verbose: true)
|
28
|
+
if !all.nil?
|
29
|
+
downloader.get_magazines_of_year year
|
30
|
+
else
|
31
|
+
downloader.get_magazine year, volume
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
desc 'setup', 'Checks if all requirements for using brandeins gem are met'
|
37
|
+
method_option :help
|
38
|
+
def setup
|
39
|
+
setup = BrandEins::Setup.new
|
40
|
+
if !options.help.nil?
|
41
|
+
setup.help
|
42
|
+
else
|
43
|
+
setup.run
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|