brandeins 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +0 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +50 -0
- data/README.md +2 -2
- data/Rakefile +18 -1
- data/lib/brandeins.rb +6 -310
- data/lib/brandeins/cli.rb +47 -0
- data/lib/brandeins/downloader.rb +112 -0
- data/lib/brandeins/merger/extensions.rb +9 -0
- data/lib/brandeins/merger/pdf_tools.rb +47 -0
- data/lib/brandeins/merger/templates/base.rb +47 -0
- data/lib/brandeins/merger/templates/ghostscript_windows.rb +25 -0
- data/lib/brandeins/merger/templates/osx.rb +11 -0
- data/lib/brandeins/merger/templates/pdftk_osx.rb +25 -0
- data/lib/brandeins/merger/templates/windows.rb +11 -0
- data/lib/brandeins/parser/archive_site.rb +54 -0
- data/lib/brandeins/parser/article_site.rb +26 -0
- data/lib/brandeins/parser/magazine_site.rb +49 -0
- data/lib/brandeins/setup.rb +4 -6
- data/lib/brandeins/version.rb +1 -1
- data/specs/brandeins_spec.rb +39 -0
- data/specs/spec_helper.rb +1 -0
- data/test/brandeins_test.rb +3 -4
- data/test/helper.rb +1 -19
- data/test_support/capture_stdout.rb +12 -0
- data/test_support/fixtures/brandeins_archiv.html +50 -0
- data/test_support/fixtures/cover.jpg +0 -0
- metadata +22 -5
- data/lib/brandeins/pdf-tools.rb +0 -89
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
brandeins (0.2.0)
|
5
|
+
nokogiri
|
6
|
+
prawn
|
7
|
+
rake
|
8
|
+
thor
|
9
|
+
|
10
|
+
GEM
|
11
|
+
remote: http://rubygems.org/
|
12
|
+
specs:
|
13
|
+
Ascii85 (1.0.2)
|
14
|
+
addressable (2.3.2)
|
15
|
+
afm (0.2.0)
|
16
|
+
columnize (0.3.6)
|
17
|
+
crack (0.3.2)
|
18
|
+
debugger (1.3.1)
|
19
|
+
columnize (>= 0.3.1)
|
20
|
+
debugger-linecache (~> 1.1.1)
|
21
|
+
debugger-ruby_core_source (~> 1.1.8)
|
22
|
+
debugger-linecache (1.1.2)
|
23
|
+
debugger-ruby_core_source (>= 1.1.1)
|
24
|
+
debugger-ruby_core_source (1.1.8)
|
25
|
+
hashery (2.1.0)
|
26
|
+
nokogiri (1.5.6)
|
27
|
+
pdf-reader (1.3.0)
|
28
|
+
Ascii85 (~> 1.0.0)
|
29
|
+
afm (~> 0.2.0)
|
30
|
+
hashery (~> 2.0)
|
31
|
+
ruby-rc4
|
32
|
+
ttfunk
|
33
|
+
prawn (0.12.0)
|
34
|
+
pdf-reader (>= 0.9.0)
|
35
|
+
ttfunk (~> 1.0.2)
|
36
|
+
rake (10.0.3)
|
37
|
+
ruby-rc4 (0.1.5)
|
38
|
+
thor (0.17.0)
|
39
|
+
ttfunk (1.0.3)
|
40
|
+
webmock (1.9.0)
|
41
|
+
addressable (>= 2.2.7)
|
42
|
+
crack (>= 0.1.7)
|
43
|
+
|
44
|
+
PLATFORMS
|
45
|
+
ruby
|
46
|
+
|
47
|
+
DEPENDENCIES
|
48
|
+
brandeins!
|
49
|
+
debugger
|
50
|
+
webmock
|
data/README.md
CHANGED
@@ -12,7 +12,7 @@ several ruby libraries (that you can get through rubygems)
|
|
12
12
|
|
13
13
|
|
14
14
|
## Install
|
15
|
-
`gem install brandeins
|
15
|
+
`gem install brandeins`
|
16
16
|
|
17
17
|
|
18
18
|
## Usage
|
@@ -23,4 +23,4 @@ Download just one magazine
|
|
23
23
|
|
24
24
|
Download the whole collecion of a certain year
|
25
25
|
|
26
|
-
`brandeins download_all --path=/Path/where/to/download/the/files --year=2011`
|
26
|
+
`brandeins download_all --path=/Path/where/to/download/the/files --year=2011 --all`
|
data/Rakefile
CHANGED
@@ -3,7 +3,8 @@ require 'rake/testtask'
|
|
3
3
|
require './lib/brandeins/version'
|
4
4
|
|
5
5
|
Rake::TestTask.new do |t|
|
6
|
-
t.
|
6
|
+
t.test_files = FileList['test/*_test.rb', 'specs/*_spec.rb']
|
7
|
+
t.verbose = true
|
7
8
|
end
|
8
9
|
|
9
10
|
task :install do
|
@@ -15,4 +16,20 @@ task publish: [ :build ] do
|
|
15
16
|
sh "gem push ./pkg/brandeins-#{BrandEins::VERSION}.gem"
|
16
17
|
end
|
17
18
|
|
19
|
+
rule /^version:bump:(major|minor|patch)/ do |t|
|
20
|
+
sh "git status | grep 'nothing to commit'"
|
21
|
+
index = ['major', 'minor','patch'].index(t.name.split(':').last)
|
22
|
+
file = 'lib/brandeins/version.rb'
|
23
|
+
|
24
|
+
version_file = File.read(file)
|
25
|
+
old_version, *version_parts = version_file.match(/(\d+)\.(\d+)\.(\d+)/).to_a
|
26
|
+
version_parts[index] = version_parts[index].to_i + 1
|
27
|
+
version_parts[2] = 0 if index < 2
|
28
|
+
version_parts[1] = 0 if index < 1
|
29
|
+
new_version = version_parts * '.'
|
30
|
+
File.open(file,'w'){|f| f.write(version_file.sub(old_version, new_version)) }
|
31
|
+
|
32
|
+
sh "git add #{file} Gemfile.lock && git commit -m 'bump version to #{new_version}'"
|
33
|
+
end
|
34
|
+
|
18
35
|
task :default => :test
|
data/lib/brandeins.rb
CHANGED
@@ -1,312 +1,8 @@
|
|
1
|
-
|
2
|
-
brandeins/version
|
3
|
-
brandeins/setup
|
4
|
-
brandeins/pdf-tools
|
5
|
-
nokogiri
|
6
|
-
open-uri
|
7
|
-
uri
|
8
|
-
fileutils
|
9
|
-
thor
|
10
|
-
prawn
|
11
|
-
).each do |lib|
|
12
|
-
begin
|
13
|
-
require lib
|
14
|
-
rescue Exception => e
|
15
|
-
puts "missing #{lib}, #{e.inspect}"
|
16
|
-
end
|
17
|
-
end
|
1
|
+
# encoding: utf-8
|
18
2
|
|
19
|
-
|
3
|
+
require 'brandeins/version'
|
4
|
+
require 'brandeins/downloader'
|
5
|
+
require 'brandeins/setup'
|
6
|
+
require 'brandeins/cli'
|
20
7
|
|
21
|
-
|
22
|
-
map '--version' => :version
|
23
|
-
|
24
|
-
desc '--version', 'Displays current version'
|
25
|
-
def version
|
26
|
-
puts BrandEins::VERSION
|
27
|
-
end
|
28
|
-
|
29
|
-
desc 'download', 'Download past brand eins magazines (use `brandeins help download` to learn more about options)'
|
30
|
-
method_option :path, :type => :string
|
31
|
-
method_option :volume, :type => :numeric
|
32
|
-
method_option :all
|
33
|
-
method_option :year, :type => :numeric
|
34
|
-
def download
|
35
|
-
path = options.path ? File.expand_path(options.path) : Dir.pwd
|
36
|
-
year = options.year || Time.new.year
|
37
|
-
all = options.all
|
38
|
-
volume = options.volume
|
39
|
-
|
40
|
-
if volume.nil? && all.nil?
|
41
|
-
puts "If you want to download a specific volune use the --volume flag or use --all to download all volumes of a year"
|
42
|
-
else
|
43
|
-
downloader = BrandEins::Downloader.new path
|
44
|
-
if !all.nil?
|
45
|
-
downloader.get_magazines_of_year year
|
46
|
-
else
|
47
|
-
downloader.get_magazine year, volume
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
desc 'setup', 'Checks if all requirements for using brandeins gem are met'
|
53
|
-
method_option :help
|
54
|
-
def setup
|
55
|
-
setup = BrandEins::Setup.new
|
56
|
-
if !options.help.nil?
|
57
|
-
setup.help
|
58
|
-
else
|
59
|
-
setup.run
|
60
|
-
end
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
class Downloader
|
65
|
-
attr_reader :archive
|
66
|
-
|
67
|
-
def initialize(path)
|
68
|
-
@url = 'http://www.brandeins.de'
|
69
|
-
@archive = false
|
70
|
-
@dl_dir = path
|
71
|
-
@tmp_dir = path + '/brand-eins-tmp'
|
72
|
-
@pdftool = BrandEins::PdfTools.get_pdf_tool
|
73
|
-
create_tmp_dirs
|
74
|
-
end
|
75
|
-
|
76
|
-
def setup
|
77
|
-
@archive = ArchiveSite.new @url
|
78
|
-
end
|
79
|
-
|
80
|
-
def get_magazines_of_year(year = 2000)
|
81
|
-
setup
|
82
|
-
puts "Getting all brand eins magazines of #{year}. This could take a while..."
|
83
|
-
magazine_links_per_year = @archive.get_magazine_links_by_year(year)
|
84
|
-
magazine_links_per_year.each_with_index do |magazine_link, index|
|
85
|
-
volume = index+1
|
86
|
-
puts "Parsing Volume #{volume} of #{year}"
|
87
|
-
target_pdf = get_target_pdf(year, volume)
|
88
|
-
get_magazine_by_link(magazine_link, target_pdf, year, volume)
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
def get_magazine(year = 2000, volume = 1)
|
93
|
-
setup
|
94
|
-
puts "Parsing Volume #{volume} of #{year}"
|
95
|
-
target_pdf = get_target_pdf(year, volume)
|
96
|
-
|
97
|
-
magazine_links = @archive.get_magazine_links_by_year(year)
|
98
|
-
target_magazine_link = magazine_links[volume-1]
|
99
|
-
|
100
|
-
get_magazine_by_link(target_magazine_link, target_pdf, year, volume)
|
101
|
-
end
|
102
|
-
|
103
|
-
private
|
104
|
-
def create_tmp_dirs
|
105
|
-
FileUtils.mkdir_p @tmp_dir unless File.directory?(@tmp_dir)
|
106
|
-
end
|
107
|
-
|
108
|
-
def get_magazine_by_link(target_magazine_link, target_pdf, year, volume)
|
109
|
-
pdf_links = @archive.magazine_pdf_links(target_magazine_link)
|
110
|
-
pdf_files = download_pdfs(pdf_links)
|
111
|
-
|
112
|
-
pdf_cover = create_cover_pdf(year, volume)
|
113
|
-
pdf_files = pdf_files.reverse.push(pdf_cover).reverse
|
114
|
-
|
115
|
-
if !@pdftool.nil?
|
116
|
-
target_pdf_path = "#{@dl_dir}/#{target_pdf}"
|
117
|
-
@pdftool.merge_pdf_files(pdf_files, target_pdf_path)
|
118
|
-
cleanup
|
119
|
-
else
|
120
|
-
if RUBY_PLATFORM.include? 'darwin'
|
121
|
-
puts 'brandeins wont merge the single pdf files since it didnt find the pdftk tool'
|
122
|
-
end
|
123
|
-
end
|
124
|
-
end
|
125
|
-
|
126
|
-
def create_cover_pdf(year, volume)
|
127
|
-
cover = @archive.get_magazine_cover(year, volume)
|
128
|
-
cover_title = cover[:title]
|
129
|
-
cover_img_url = cover[:img_url]
|
130
|
-
cover_img_file = @tmp_dir + "/cover-#{year}-#{volume}.jpg"
|
131
|
-
cover_pdf_file = @tmp_dir + "/cover-#{year}-#{volume}.pdf"
|
132
|
-
|
133
|
-
File.open(cover_img_file,'w') do |f|
|
134
|
-
uri = URI.parse(cover_img_url)
|
135
|
-
Net::HTTP.start(uri.host,uri.port) do |http|
|
136
|
-
http.request_get(uri.path) do |res|
|
137
|
-
res.read_body do |seg|
|
138
|
-
f << seg
|
139
|
-
#hack -- adjust to suit:
|
140
|
-
sleep 0.005
|
141
|
-
end
|
142
|
-
end
|
143
|
-
end
|
144
|
-
end
|
145
|
-
|
146
|
-
require 'prawn'
|
147
|
-
Prawn::Document.generate(cover_pdf_file) do |pdf|
|
148
|
-
pdf.text "<font size='18'><b>" + cover_title + "</b></font>", :align => :center, :inline_format => true
|
149
|
-
pdf.image cover_img_file, :position => :center, :vposition => :center
|
150
|
-
end
|
151
|
-
return cover_pdf_file
|
152
|
-
end
|
153
|
-
|
154
|
-
def get_target_pdf(year, volume)
|
155
|
-
"Brand-Eins-#{year}-#{volume}.pdf"
|
156
|
-
end
|
157
|
-
|
158
|
-
def download_pdfs(pdf_links)
|
159
|
-
pdf_downloader = PDFDownloader.new(pdf_links, @tmp_dir)
|
160
|
-
pdf_downloader.download_all
|
161
|
-
end
|
162
|
-
|
163
|
-
def cleanup
|
164
|
-
FileUtils.rm_r @tmp_dir
|
165
|
-
end
|
166
|
-
|
167
|
-
class PDFDownloader
|
168
|
-
|
169
|
-
def initialize(pdf_links, dl_dir)
|
170
|
-
@dl_dir = dl_dir
|
171
|
-
@pdf_links = pdf_links
|
172
|
-
end
|
173
|
-
|
174
|
-
def download_all
|
175
|
-
pdf_files = Array.new
|
176
|
-
@pdf_links.each do |pdf_link|
|
177
|
-
pdf_name = @dl_dir + '/' + File.basename(pdf_link)
|
178
|
-
pdf_url = pdf_link
|
179
|
-
download_pdf(pdf_url, pdf_name)
|
180
|
-
pdf_files << pdf_name
|
181
|
-
end
|
182
|
-
pdf_files
|
183
|
-
end
|
184
|
-
|
185
|
-
private
|
186
|
-
|
187
|
-
def download_pdf(pdf_url, filename)
|
188
|
-
if File.exists? filename
|
189
|
-
puts "File #{filename} seems to be already downloaded"
|
190
|
-
return true
|
191
|
-
end
|
192
|
-
|
193
|
-
puts "Downloading PDF from #{pdf_url} to #{filename}"
|
194
|
-
File.open(filename,'wb') do |new_file|
|
195
|
-
open(pdf_url, 'rb') do |read_file|
|
196
|
-
new_file.write(read_file.read)
|
197
|
-
end
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
|
-
end
|
202
|
-
|
203
|
-
class ArchiveSite
|
204
|
-
attr_accessor :doc
|
205
|
-
|
206
|
-
def initialize(base_url, html = false)
|
207
|
-
@base_url = base_url
|
208
|
-
@archive_url = @base_url + "/archiv.html"
|
209
|
-
if html
|
210
|
-
@doc = Nokogiri::HTML(html)
|
211
|
-
end
|
212
|
-
end
|
213
|
-
|
214
|
-
def setup
|
215
|
-
return if defined?(@doc) != nil
|
216
|
-
@doc = Nokogiri::HTML(open(@archive_url))
|
217
|
-
end
|
218
|
-
|
219
|
-
def get_magazine_links_by_year(year = 2000)
|
220
|
-
setup
|
221
|
-
puts "Loading Magazine from year #{year}"
|
222
|
-
magazine_nodes_with_meta = @doc.css(".jahrgang-#{year} ul li")
|
223
|
-
magazine_links = Array.new
|
224
|
-
magazine_nodes_with_meta.each_with_index do |node, index|
|
225
|
-
if node['id'].nil? then
|
226
|
-
link = node.css('a')
|
227
|
-
if link[0].nil? then
|
228
|
-
next
|
229
|
-
end
|
230
|
-
href = link[0]['href']
|
231
|
-
magazine_links << @base_url + '/' + href
|
232
|
-
end
|
233
|
-
end
|
234
|
-
magazine_links
|
235
|
-
end
|
236
|
-
|
237
|
-
def get_magazine_cover(year, volume)
|
238
|
-
title = @doc.css("#month_detail_#{year}_#{volume} .titel").children[0].to_s
|
239
|
-
img_url = ''
|
240
|
-
@doc.css("#month_detail_#{year}_#{volume} .cover a img").each do |node|
|
241
|
-
img_url = node['src']
|
242
|
-
end
|
243
|
-
return { :title => title, :img_url => @base_url + '/' + img_url }
|
244
|
-
end
|
245
|
-
|
246
|
-
def magazine_pdf_links(url)
|
247
|
-
magazine = ArchiveMagazine.new(url, @base_url)
|
248
|
-
magazine.get_magazine_pdf_links
|
249
|
-
end
|
250
|
-
|
251
|
-
class ArchiveMagazine
|
252
|
-
attr_accessor :url, :doc
|
253
|
-
|
254
|
-
def initialize(url, base_url, html = false)
|
255
|
-
puts "Parsing #{url}"
|
256
|
-
@url = url
|
257
|
-
@base_url = base_url
|
258
|
-
@doc = Nokogiri::HTML(open(url))
|
259
|
-
end
|
260
|
-
|
261
|
-
def get_magazine_pdf_links
|
262
|
-
[get_editorial_article_links, get_schwerpunkt_article_links].flatten
|
263
|
-
end
|
264
|
-
|
265
|
-
def get_schwerpunkt_article_links
|
266
|
-
get_links("div.articleList ul h4 a")
|
267
|
-
end
|
268
|
-
|
269
|
-
def get_editorial_article_links
|
270
|
-
get_links(".editorial-links li a")
|
271
|
-
end
|
272
|
-
|
273
|
-
def get_links(css_selector)
|
274
|
-
pdf_links = Array.new
|
275
|
-
link_nodes = @doc.css(css_selector)
|
276
|
-
link_nodes.each do |node|
|
277
|
-
article_link = @base_url + '/' + node['href']
|
278
|
-
article = MagazineArticle.new(article_link)
|
279
|
-
pdf_link = article.get_pdf_link
|
280
|
-
if pdf_link.nil? then
|
281
|
-
puts "------------------------------"
|
282
|
-
puts "No Content for: #{article_link}"
|
283
|
-
puts "------------------------------"
|
284
|
-
else
|
285
|
-
pdf_links << @base_url + '/' + pdf_link
|
286
|
-
end
|
287
|
-
end
|
288
|
-
pdf_links
|
289
|
-
end
|
290
|
-
|
291
|
-
class MagazineArticle
|
292
|
-
attr_accessor :url, :doc
|
293
|
-
|
294
|
-
def initialize(url)
|
295
|
-
puts "Parsing Article: #{url}"
|
296
|
-
@url = url
|
297
|
-
@doc = Nokogiri::HTML(open(url))
|
298
|
-
end
|
299
|
-
|
300
|
-
def get_pdf_link
|
301
|
-
link = @doc.css("div#sidebar ul li#downloaden a")
|
302
|
-
if link[0].nil? then
|
303
|
-
return nil
|
304
|
-
else
|
305
|
-
return link[0]['href']
|
306
|
-
end
|
307
|
-
end
|
308
|
-
end
|
309
|
-
end
|
310
|
-
end
|
311
|
-
end
|
312
|
-
end
|
8
|
+
module BrandEins; end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'thor'
|
3
|
+
|
4
|
+
module BrandEins
|
5
|
+
class CLI < Thor
|
6
|
+
map '--version' => :version
|
7
|
+
|
8
|
+
desc '--version', 'Displays current version'
|
9
|
+
def version
|
10
|
+
puts BrandEins::VERSION
|
11
|
+
end
|
12
|
+
|
13
|
+
desc 'download', 'Download past brand eins magazines (use `brandeins help download` to learn more about options)'
|
14
|
+
method_option :path, :type => :string
|
15
|
+
method_option :volume, :type => :numeric
|
16
|
+
method_option :all
|
17
|
+
method_option :year, :type => :numeric
|
18
|
+
def download
|
19
|
+
path = options.path ? File.expand_path(options.path) : Dir.pwd
|
20
|
+
year = options.year || Time.new.year
|
21
|
+
all = options.all
|
22
|
+
volume = options.volume
|
23
|
+
|
24
|
+
if volume.nil? and all.nil?
|
25
|
+
puts "If you want to download a specific volune use the --volume flag or use --all to download all volumes of a year"
|
26
|
+
else
|
27
|
+
downloader = BrandEins::Downloader.new(path, verbose: true)
|
28
|
+
if !all.nil?
|
29
|
+
downloader.get_magazines_of_year year
|
30
|
+
else
|
31
|
+
downloader.get_magazine year, volume
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
desc 'setup', 'Checks if all requirements for using brandeins gem are met'
|
37
|
+
method_option :help
|
38
|
+
def setup
|
39
|
+
setup = BrandEins::Setup.new
|
40
|
+
if !options.help.nil?
|
41
|
+
setup.help
|
42
|
+
else
|
43
|
+
setup.run
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|