brandeins-dl 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,3 +1,11 @@
1
1
  source "http://rubygems.org"
2
2
 
3
+ group :test do
4
+ if ENV['RUBY_VERSION'][5,3] == '1.8'
5
+ gem 'minitest'
6
+ end
7
+ gem 'fakefs'
8
+ end
9
+
10
+
3
11
  gemspec
data/Rakefile CHANGED
@@ -1 +1,6 @@
1
- require "bundler/gem_tasks"
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.pattern = 'test/*_test.rb'
6
+ end
data/lib/brandeins-dl/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module BrandEins
2
- VERSION = "0.0.3"
2
+ VERSION = '0.0.4'
3
3
  end
data/lib/brandeins-dl.rb CHANGED
@@ -1,9 +1,13 @@
1
- require "brandeins-dl/version"
2
- require 'nokogiri'
3
- require 'open-uri'
4
- require 'uri'
5
- require 'fileutils'
6
- require 'thor'
1
+ %w(
2
+ brandeins-dl/version
3
+ nokogiri
4
+ open-uri
5
+ uri
6
+ fileutils
7
+ thor
8
+ ).each do |lib|
9
+ require lib
10
+ end
7
11
 
8
12
  module BrandEins
9
13
 
@@ -28,94 +32,99 @@ module BrandEins
28
32
 
29
33
  class Downloader
30
34
  attr_reader :archive
31
-
35
+
32
36
  def initialize(path)
33
- @url = "http://www.brandeins.de"
34
- @archive = ArchiveSite.new
37
+ @url = 'http://www.brandeins.de'
38
+ @archive = false
35
39
  @dl_dir = path
36
- @tmp_dir = path + "/tmp"
37
-
38
- check_download_path
40
+ @tmp_dir = path + '/tmp'
41
+ create_tmp_dirs
39
42
  end
40
43
 
41
- def check_download_path
42
- FileUtils.mkdir_p @tmp_dir unless File.exists?(@tmp_dir)
44
+ def setup
45
+ @archive = ArchiveSite.new @url
43
46
  end
44
-
47
+
45
48
  def get_magazines_of_year(year = 2000)
49
+ setup
46
50
  puts "Getting all brand eins magazines of a #{year}. This could take a while..."
47
- magazine_links_per_year = @archive.magazine_links_by_year(year)
51
+ magazine_links_per_year = @archive.get_magazine_links_by_year(year)
48
52
  magazine_links_per_year.each_with_index do |magazine_link, volume|
49
53
  puts "Parsing Volume #{volume} of #{year}"
50
54
  target_pdf = get_target_pdf(year, volume)
51
55
  get_magazine_by_link(magazine_link, target_pdf)
52
56
  end
53
57
  end
54
-
58
+
55
59
  def get_magazine(year = 2000, volume = 1)
60
+ setup
56
61
  puts "Parsing Volume #{volume} of #{year}"
57
62
  target_pdf = get_target_pdf(year, volume)
58
-
59
- magazine_links = @archive.magazine_links_by_year(year)
63
+
64
+ magazine_links = @archive.get_magazine_links_by_year(year)
60
65
  target_magazine_link = magazine_links[volume-1]
61
-
66
+
62
67
  get_magazine_by_link(target_magazine_link, target_pdf)
63
68
  end
64
-
69
+
70
+ private
71
+ def create_tmp_dirs
72
+ FileUtils.mkdir_p @tmp_dir unless File.directory?(@tmp_dir)
73
+ end
74
+
65
75
  def get_magazine_by_link(target_magazine_link, target_pdf)
66
76
  pdf_links = @archive.magazine_pdf_links(target_magazine_link)
67
77
  process_pdf_links(pdf_links, target_pdf)
68
78
  cleanup
69
79
  end
70
-
71
-
80
+
72
81
  def get_target_pdf(year, volume)
73
82
  "Brand-Eins-#{year}-#{volume}.pdf"
74
83
  end
75
-
84
+
76
85
  def process_pdf_links(pdf_links, target_pdf)
77
86
  pdf_downloader = PDFDownloader.new(pdf_links, @tmp_dir)
78
87
  pdf_files = pdf_downloader.download_all
79
88
  merge_pdfs(pdf_files, target_pdf)
80
89
  end
81
-
90
+
82
91
  def merge_pdfs(pdf_files, target_pdf)
83
92
  puts "Merging single PDFs now"
84
93
  pdf_sources = pdf_files.join(" ")
85
94
  system "pdftk #{pdf_sources} output #{@dl_dir}/#{target_pdf}"
86
95
  end
87
-
96
+
88
97
  def cleanup
89
98
  FileUtils.rm_r @tmp_dir
90
99
  end
91
-
100
+
92
101
  class PDFDownloader
93
-
102
+
94
103
  def initialize(pdf_links, dl_dir)
95
104
  @dl_dir = dl_dir
96
105
  @pdf_links = pdf_links
97
106
  end
98
-
107
+
99
108
  def download_all
100
109
  pdf_files = Array.new
101
110
  @pdf_links.each do |pdf_link|
102
111
  pdf_name = @dl_dir + '/' + File.basename(pdf_link)
103
112
  pdf_url = pdf_link
104
113
  download_pdf(pdf_url, pdf_name)
105
-
114
+
106
115
  pdf_files << pdf_name
107
116
  end
108
117
  pdf_files
109
118
  end
110
-
111
- private
112
-
119
+
120
+ private
121
+
113
122
  def download_pdf(pdf_url, filename)
114
123
  puts "Downloading PDF from #{pdf_url} to #{filename}"
115
124
  File.open(filename,'w') do |f|
116
125
  uri = URI.parse(pdf_url)
117
- Net::HTTP.start(uri.host,uri.port) do |http|
118
- http.request_get(uri.path) do |res|
126
+ Net::HTTP.start(uri.host,uri.port) do |http|
127
+ http.request_get(uri.path) do |res|
119
128
  res.read_body do |seg|
120
129
  f << seg
121
130
  #hack -- adjust to suit:
@@ -125,20 +134,26 @@ module BrandEins
125
134
  end
126
135
  end
127
136
  end
128
-
137
+
129
138
  end
130
-
139
+
131
140
  class ArchiveSite
132
-
133
141
  attr_accessor :doc
134
-
135
- def initialize
136
- @base_url = "http://www.brandeins.de"
142
+
143
+ def initialize(base_url, html = false)
144
+ @base_url = base_url
137
145
  @archive_url = @base_url + "/archiv.html"
146
+ if html
147
+ @doc = Nokogiri::HTML(html)
148
+ end
149
+ end
150
+
151
+ def setup
152
+ return if defined?(@doc) != nil
138
153
  @doc = Nokogiri::HTML(open(@archive_url))
139
154
  end
140
-
141
- def magazine_links_by_year(year = 2000)
155
+
156
+ def get_magazine_links_by_year(year = 2000)
142
157
  puts "Loading Magazine from year #{year}"
143
158
  magazine_nodes_with_meta = @doc.css(".jahrgang-#{year} ul li")
144
159
  magazine_links = Array.new
@@ -154,35 +169,35 @@ module BrandEins
154
169
  end
155
170
  magazine_links
156
171
  end
157
-
172
+
158
173
  def magazine_pdf_links(url)
159
- magazine = ArchiveMagazine.new(url)
174
+ magazine = ArchiveMagazine.new(url, @base_url)
160
175
  magazine.get_magazine_pdf_links
161
176
  end
162
-
177
+
163
178
  class ArchiveMagazine
164
179
  attr_accessor :url, :doc
165
-
166
- def initialize(url)
180
+
181
+ def initialize(url, base_url, html = false)
167
182
  puts "Parsing #{url}"
168
183
  @url = url
169
- @base_url = "http://www.brandeins.de"
184
+ @base_url = base_url
170
185
  @doc = Nokogiri::HTML(open(url))
171
186
  end
172
-
187
+
173
188
  def get_magazine_pdf_links
174
189
  [get_editorial_article_links, get_schwerpunkt_article_links].flatten
175
-
190
+
176
191
  end
177
-
192
+
178
193
  def get_schwerpunkt_article_links
179
194
  get_links("div.articleList ul h4 a")
180
195
  end
181
-
196
+
182
197
  def get_editorial_article_links
183
198
  get_links(".editorial-links li a")
184
199
  end
185
-
200
+
186
201
  def get_links(css_selector)
187
202
  pdf_links = Array.new
188
203
  link_nodes = @doc.css(css_selector)
@@ -200,16 +215,16 @@ module BrandEins
200
215
  end
201
216
  pdf_links
202
217
  end
203
-
218
+
204
219
  class MagazineArticle
205
220
  attr_accessor :url, :doc
206
-
221
+
207
222
  def initialize(url)
208
223
  puts "Parsing Article: #{url}"
209
224
  @url = url
210
225
  @doc = Nokogiri::HTML(open(url))
211
226
  end
212
-
227
+
213
228
  def get_pdf_link
214
229
  link = @doc.css("div#sidebar ul li#downloaden a")
215
230
  if link[0].nil? then
@@ -222,7 +237,6 @@ module BrandEins
222
237
  end
223
238
 
224
239
  end
225
-
226
240
  end
227
241
 
228
242
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: brandeins-dl
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.3
5
+ version: 0.0.4
6
6
  platform: ruby
7
7
  authors:
8
8
  - Gregory Igelmund
@@ -10,10 +10,11 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2012-02-26 00:00:00 Z
13
+ date: 2012-10-06 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: rake
17
+ prerelease: false
17
18
  requirement: &id001 !ruby/object:Gem::Requirement
18
19
  none: false
19
20
  requirements:
@@ -21,10 +22,10 @@ dependencies:
21
22
  - !ruby/object:Gem::Version
22
23
  version: "0"
23
24
  type: :runtime
24
- prerelease: false
25
25
  version_requirements: *id001
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: thor
28
+ prerelease: false
28
29
  requirement: &id002 !ruby/object:Gem::Requirement
29
30
  none: false
30
31
  requirements:
@@ -32,10 +33,10 @@ dependencies:
32
33
  - !ruby/object:Gem::Version
33
34
  version: "0"
34
35
  type: :runtime
35
- prerelease: false
36
36
  version_requirements: *id002
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: nokogiri
39
+ prerelease: false
39
40
  requirement: &id003 !ruby/object:Gem::Requirement
40
41
  none: false
41
42
  requirements:
@@ -43,7 +44,6 @@ dependencies:
43
44
  - !ruby/object:Gem::Version
44
45
  version: "0"
45
46
  type: :runtime
46
- prerelease: false
47
47
  version_requirements: *id003
48
48
  description: "BrandEins Downloader offers two commands: 'brandeins download YEAR' and 'brandeins download YEAR --volume=NUMBER'"
49
49
  email:
@@ -77,23 +77,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
77
77
  requirements:
78
78
  - - ">="
79
79
  - !ruby/object:Gem::Version
80
- hash: -1677672259199041797
81
- segments:
82
- - 0
83
80
  version: "0"
84
81
  required_rubygems_version: !ruby/object:Gem::Requirement
85
82
  none: false
86
83
  requirements:
87
84
  - - ">="
88
85
  - !ruby/object:Gem::Version
89
- hash: -1677672259199041797
90
- segments:
91
- - 0
92
86
  version: "0"
93
87
  requirements: []
94
88
 
95
89
  rubyforge_project:
96
- rubygems_version: 1.8.6
90
+ rubygems_version: 1.8.15
97
91
  signing_key:
98
92
  specification_version: 3
99
93
  summary: BrandEins Downloader allows you to download past volumes of the Brand Eins magazine