brandeins 0.2.2 → 0.3.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +5 -1
- data/.rspec +2 -0
- data/.rubocop.yml +5 -0
- data/.ruby-version +1 -0
- data/.travis.yml +11 -0
- data/Gemfile +7 -4
- data/Gemfile.lock +47 -21
- data/NOTES.md +6 -0
- data/Rakefile +15 -8
- data/bin/brandeins +3 -1
- data/brandeins.gemspec +0 -1
- data/lib/brandeins.rb +3 -5
- data/lib/brandeins/cli.rb +46 -34
- data/lib/brandeins/config.rb +18 -0
- data/lib/brandeins/kiosk.rb +100 -0
- data/lib/brandeins/merger/external/base.rb +16 -6
- data/lib/brandeins/merger/pdf_tools.rb +3 -6
- data/lib/brandeins/pages/archive.rb +91 -0
- data/lib/brandeins/pages/article.rb +37 -0
- data/lib/brandeins/pages/cover.rb +67 -0
- data/lib/brandeins/pages/magazine.rb +149 -0
- data/lib/brandeins/utils/cli_option_parser.rb +40 -0
- data/lib/brandeins/utils/cli_output.rb +100 -0
- data/lib/brandeins/utils/fetcher.rb +115 -0
- data/lib/brandeins/utils/merger.rb +41 -0
- data/lib/brandeins/version.rb +1 -1
- data/rubocop-todo.yml +141 -0
- data/spec/lib/brandeins/kiosk_spec.rb +66 -0
- data/spec/lib/brandeins/pages/archive_spec.rb +40 -0
- data/spec/lib/brandeins/pages/article_spec.rb +23 -0
- data/spec/lib/brandeins/pages/magazine_spec.rb +91 -0
- data/spec/lib/brandeins/utils/fetcher_spec.rb +8 -0
- data/spec/lib/brandeins_spec.rb +19 -0
- data/spec/spec_helper.rb +23 -0
- data/spec/support/capture_stdout.rb +12 -0
- data/spec/support/fixtures/archive.html +2365 -0
- data/spec/support/fixtures/artikel-masskonfektion-aus-plastik.html +254 -0
- data/spec/support/fixtures/artikel-schauspieler-daenemark.html +247 -0
- data/{test_support → spec/support}/fixtures/cover.jpg +0 -0
- data/spec/support/fixtures/editorial.html +236 -0
- data/spec/support/fixtures/just-a.pdf +0 -0
- data/spec/support/fixtures/magazine-1-2013.html +242 -0
- data/spec/support/fixtures/magazine-cover-fallback.html +1610 -0
- data/spec/support/fixtures/magazine-with-cover.html +1416 -0
- metadata +68 -61
- data/.rvmrc +0 -48
- data/lib/brandeins/downloader.rb +0 -111
- data/lib/brandeins/errors.rb +0 -5
- data/lib/brandeins/parser/archive_site.rb +0 -54
- data/lib/brandeins/parser/article_site.rb +0 -26
- data/lib/brandeins/parser/magazine_site.rb +0 -49
- data/lib/brandeins/setup.rb +0 -38
- data/specs/brandeins_spec.rb +0 -52
- data/specs/spec_helper.rb +0 -1
- data/test/brandeins_test.rb +0 -65
- data/test/helper.rb +0 -1
- data/test_support/capture_stdout.rb +0 -12
- data/test_support/fixtures/brandeins_archiv.html +0 -50
@@ -1,5 +1,9 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
+
require 'shellwords'
|
4
|
+
|
5
|
+
require_relative '../../utils/cli_output'
|
6
|
+
|
3
7
|
module BrandEins
|
4
8
|
module Merger
|
5
9
|
module External
|
@@ -15,19 +19,25 @@ module BrandEins
|
|
15
19
|
# Argument template for the external merge tool.
#
# Subclasses must override this method; the base implementation always
# raises.
#
# @raise [RuntimeError] always
def args
  raise 'Must be implemented by the subclasses'
end
|
16
20
|
|
17
21
|
# Merges +pdf_files+ into +target_pdf+ by running the external merge tool.
#
# The command line is built from +cmd+ and the +args+ template, in which the
# +__pdf_files__+ and +__target_pdf__+ placeholders are replaced by the
# single-quoted paths.
#
# @param pdf_files [Array<#to_s>] paths of the PDF files to merge
# @param target_pdf [#to_s] path of the merged PDF file
# @return [Boolean] true when no error was raised, false otherwise
def merge_pdf_files(pdf_files, target_pdf)
  # TODO: Plain single quotes break on paths that contain a single quote.
  # NOTE(review): Shellwords.escape targets Bourne shells; confirm it suits
  # the GhostscriptWindows merger before switching to it.
  quoted_files = pdf_files.map { |pdf_file| "'#{pdf_file}'" }.join(' ')
  quoted_target = "'#{target_pdf}'"

  # Block-form gsub inserts the replacement verbatim. With a String
  # replacement, backslash sequences such as "\1" or "\'" inside a path would
  # be interpreted as back-references and corrupt the command line.
  command_args = args.join(' ')
                     .gsub('__pdf_files__') { quoted_files }
                     .gsub('__target_pdf__') { quoted_target }

  cli.info "Running PDF Merger for #{target_pdf}"
  cli.debug "Executing: `#{cmd} #{command_args}`" do
    _exec("#{cmd} #{command_args}")
  end
  true
rescue StandardError => e
  # StandardError only: Interrupt and SystemExit must keep propagating.
  cli.error "Error when merging file: #{e.inspect}"
  false
end
|
29
36
|
|
30
|
-
|
37
|
+
# CLI output helper used for progress and error messages; the instance is
# cached on this object after the first lookup.
def cli
  return @cli if @cli

  @cli = BrandEins::Utils::CliOutput.instance
end
|
40
|
+
|
31
41
|
def _exec (cmd)
|
32
42
|
IO.popen(cmd) do |io|
|
33
43
|
io.each do |line|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
|
2
|
+
require_relative 'external/base'
|
3
|
+
require_relative 'external/pdftk'
|
4
|
+
require_relative 'external/ghostscript_windows'
|
5
5
|
|
6
6
|
module BrandEins
|
7
7
|
module Merger
|
@@ -14,8 +14,6 @@ module BrandEins
|
|
14
14
|
get_klass_for_external(env).new
|
15
15
|
end
|
16
16
|
|
17
|
-
private
|
18
|
-
|
19
17
|
def get_klass_for_external(env)
|
20
18
|
if env[:os].include? 'w32'
|
21
19
|
BrandEins::Merger::External::GhostscriptWindows
|
@@ -23,7 +21,6 @@ module BrandEins
|
|
23
21
|
BrandEins::Merger::External::Pdftk
|
24
22
|
end
|
25
23
|
end
|
26
|
-
|
27
24
|
end
|
28
25
|
|
29
26
|
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'english'
|
5
|
+
|
6
|
+
require_relative '../config'
|
7
|
+
require_relative '../pages/magazine'
|
8
|
+
|
9
|
+
module BrandEins
|
10
|
+
module Pages
|
11
|
+
# Usage of +Archive+
#
#   page = Archive.new(html: html)
#   page.magazines_for_year(2000)
#   # => { 1 => Magazine, 2 => Magazine, ... }   (keyed by month)
#
#   page.magazine_for(month: 1, year: 2000)
#   # => Magazine
#
#   page.magazine_for(month: 13, year: 9999)
#   # => nil
#
class Archive
  # @param opts [Hash] options; +:html+ may carry the archive markup so that
  #   nothing has to be downloaded. The hash is only read, never modified.
  def initialize(opts = {})
    @html = opts[:html]
    @magazines = {}
  end

  # Archive markup; fetched from the configured archive URL unless it was
  # passed to the constructor. The lookup runs inside a +cli.info+ block and
  # this method returns whatever +cli.info+ returns.
  # NOTE(review): presumably the block's value, i.e. the markup; confirm
  # against CliOutput#info.
  #
  # @raise [BrandEins::Utils::Fetcher::ContentNotFetchedError] re-raised with
  #   a more helpful message when the download fails
  def html
    cli.info "Loading the archive" do
      @html ||= fetcher.fetch(archive_url)
    end
  rescue BrandEins::Utils::Fetcher::ContentNotFetchedError => e
    raise e, "Could not download the archiv.html (May be the URL changed?)\n=> Original error: #{e.message}", e.backtrace
  end

  # @param year [Integer] publication year
  # @return [Hash{Integer => Magazine}] magazines of that year keyed by
  #   month; empty when the archive has no section for the year
  def magazines_for_year(year)
    @magazines[year] ||= parse_magazines_for_year(year)
  end

  # @param month [Integer] publication month
  # @param year [Integer] publication year
  # @return [Magazine, nil] the matching issue, or nil when there is none
  def magazine_for(month: nil, year: nil)
    magazines_for_year(year)[month]
  end

  private

  # Parsed archive markup.
  def document
    @document ||= Nokogiri::HTML(html)
  end

  # Collects the issues listed under the year's heading.
  def parse_magazines_for_year(year)
    anchor = document.css("h3#anchor-#{year}").first
    # A year without a heading has no issues; without this guard the lookup
    # below would raise NoMethodError on nil.
    return {} unless anchor

    root = anchor.xpath('../../..')
    root.css('article figure').each_with_object({}) do |figure, magazines|
      month = extract_magazine_month(figure)
      magazines[month] = BrandEins::Pages::Magazine.new(url: extract_magazine_url(figure))
    end
  end

  # Absolute URL of the issue page linked from +figure+.
  def extract_magazine_url(figure)
    brandeins_url + '/' + figure.css('a.read.more').first['href']
  end

  # Month taken from the two digits in front of the slash in the figure's
  # ".meta" text; nil when the text does not match.
  def extract_magazine_month(figure)
    meta = figure.css('.meta').first
    match = meta.text.match(/(?:.+)(\d{2})\/(?:.+)/)
    match && match[1].to_i
  end

  def brandeins_url
    BrandEins::Config['base_uri']
  end

  def archive_url
    BrandEins::Config['archive_uri']
  end

  def fetcher
    @fetcher ||= BrandEins::Utils::Fetcher.instance
  end

  def cli
    @cli ||= BrandEins::Utils::CliOutput.instance
  end
end
|
90
|
+
end
|
91
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
require_relative '../config'
|
6
|
+
|
7
|
+
module BrandEins
|
8
|
+
module Pages
|
9
|
+
# Read-only view on the markup of a single article page.
class Article
  # @param html [String] markup of the article page
  def initialize(html)
    @html = html
  end

  # @return [String, nil] URL of the first link whose href ends in "pdf",
  #   prefixed with the configured base URI; nil when the page has no such
  #   link
  def pdf_url
    link = document.css('a[href$=pdf]').first
    brandeins_url + '/' + link['href'] if link
  end

  # @return [String, nil] text of the first child node of the first
  #   "h2.csc-firstHeader" element with newlines removed; nil when the
  #   heading is missing or has no child nodes
  def title
    heading = document.css('h2.csc-firstHeader').first
    first_child = heading && heading.children.first
    first_child.text.delete("\n") if first_child
  end

  # Parsed article markup.
  def document
    @document ||= Nokogiri::HTML(@html)
  end

  def brandeins_url
    BrandEins::Config['base_uri']
  end
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'tempfile'
|
4
|
+
require 'prawn'
|
5
|
+
|
6
|
+
require_relative '../utils/fetcher'
|
7
|
+
|
8
|
+
module BrandEins
|
9
|
+
module Pages
|
10
|
+
# Builds a one-page cover PDF for a magazine issue.
class Cover
  # @param magazine [#cover_url, #title, #month, #year] the issue the cover
  #   belongs to
  def initialize(magazine)
    @magazine = magazine
  end

  def cover_image_url
    @magazine.cover_url
  end

  def cover_title
    @magazine.title
  end

  # Downloads the cover image and renders the cover page.
  #
  # @return [String] the rendered PDF document
  def to_pdf
    create_cover_pdf(download_cover_image)
  end

  def download_cover_image
    fetcher.fetch(cover_image_url)
  end

  # Renders a page showing the centered, bold issue title.
  #
  # @param image [Object] downloaded cover image; currently not embedded
  # @return [String] the rendered PDF document
  def create_cover_pdf(image)
    Prawn::Document.new do |pdf|
      pdf.text %(<font size="18"><b>#{cover_title}</b></font>),
               align: :center,
               inline_format: true
      # TODO: get Null Byte?
      # pdf.image image, position: :center, vposition: :center
    end.render
  end

  # Writes the cover PDF below +path+ unless the file is already there.
  #
  # @param path [String, Pathname] target directory
  # @return [Pathname, nil] path of the cover file; nil when content could
  #   not be fetched (the cover is treated as optional)
  def save_to(path)
    cover_file_path = cover_file_path_for_path(path)
    # File.exist? instead of File.exists?, which Ruby 3.2 removed.
    return cover_file_path if File.exist?(cover_file_path)

    pdf = to_pdf
    # Make sure the target directory exists before writing into it.
    cover_file_path.dirname.mkpath
    File.binwrite(cover_file_path, pdf)
    cover_file_path
  rescue BrandEins::Utils::Fetcher::ContentNotFetchedError
    # Best effort: a failed download must not abort the caller.
    nil
  end

  def cover_file_path_for_path(path)
    Pathname.new(path) + file_name
  end

  def file_name
    "magazine-cover-#{@magazine.month}-#{@magazine.year}.pdf"
  end

  def fetcher
    @fetcher ||= BrandEins::Utils::Fetcher.instance
  end
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,149 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'english'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
require_relative '../config'
|
7
|
+
require_relative '../utils/fetcher'
|
8
|
+
require_relative '../utils/cli_output'
|
9
|
+
require_relative '../pages/article'
|
10
|
+
|
11
|
+
module BrandEins
|
12
|
+
module Pages
|
13
|
+
# Usage of +Magazine+
#
#   page = BrandEins::Pages::Magazine.new(html)        # markup as String
#   page = BrandEins::Pages::Magazine.new(url: url)    # fetched on demand
#   page.article_pdf_urls
#   # => ['http://example.com/archive/article1.pdf',
#   #     'http://example.com/archive/article2.pdf',
#   #     ...
#   #    ]
#
#   page.cover_url
#   # => 'http://example.com/archive/cover1.png'
#
class Magazine
  # @param opts [Hash, String] either the issue markup as a String or a
  #   Hash with +:html+ (markup) and/or +:url+ (address of the issue page)
  def initialize(opts = {})
    opts = { html: opts } if opts.is_a?(String)
    @html = opts[:html]
    @url = opts[:url]
  end

  # Issue markup; fetched from the URL given to the constructor when no
  # markup was passed in.
  #
  # @raise [ArgumentError] when neither markup nor URL is known. Without
  #   this guard the url -> parse_url -> document -> html chain would recurse
  #   until the stack overflows.
  def html
    @html ||= begin
      raise ArgumentError, 'Magazine needs either :html or :url' unless @url
      fetcher.fetch(@url)
    end
  end

  # @return [Array<String>] URLs of the article pages listed in the issue
  def article_urls
    @article_urls ||= parse_article_urls
  end

  # Fetches every article page and collects the PDF links found there.
  # Articles without a PDF are reported via +cli.info+ and skipped.
  #
  # @return [Array<String>] PDF URLs, in article order
  def article_pdf_urls
    @article_pdf_urls ||= article_urls.each_with_object([]) do |article_url, pdf_urls|
      article = BrandEins::Pages::Article.new(fetcher.fetch(article_url))
      pdf_url = article.pdf_url
      if pdf_url
        pdf_urls << pdf_url
      else
        # Only log here: the logger's return value must never end up in the
        # list of URLs.
        cli.info "No PDF for: \"#{article.title}\""
      end
    end
  end

  # @return [String, nil] absolute URL of the cover image, nil if none found
  def cover_url
    @cover_url ||= parse_cover_image_url
  end

  # @return [String] text of the first child node of ".current-issue h2"
  def title
    @title ||= document.css('.current-issue h2').children.first.text
  end

  # @return [Integer, nil] publication year parsed from the issue text
  def year
    @year ||= parse_year
  end

  # @return [Integer, nil] publication month parsed from the issue text
  def month
    @month ||= parse_month
  end

  # @return [String] the URL given to the constructor, otherwise the
  #   "og:url" value found in the markup
  def url
    @url ||= parse_url
  end

  # Parsed issue markup.
  def document
    @document ||= Nokogiri::HTML(html)
  end

  def parse_article_urls
    document.css('.ihv_list > a').map do |node|
      brandeins_url + '/' + node['href']
    end
  end

  # Prefers the ".coverImage" picture and falls back to the teaser image.
  def parse_cover_image_url
    img_tag = primary_cover_image || secondary_cover_image
    brandeins_url + '/' + img_tag.attributes['src'].text if img_tag
  end

  def secondary_cover_image
    document.css('.preparedTeaserImage img').first
  end

  def primary_cover_image
    document.css('.coverImage img').first
  end

  # Part after the slash of "Ausgabe <month>/<year>".
  def parse_year
    match = issue_text.match(/Ausgabe (?:.+)\/(.+)/)
    match[1].to_i if match
  end

  # Part before the slash of "Ausgabe <month>/<year>".
  def parse_month
    match = issue_text.match(/Ausgabe (.+)\/(?:.+)/)
    match[1].to_i if match
  end

  # Text of the first child node of the last ".current-issue h3" element.
  def issue_text
    document.css('.current-issue h3').last.children.first.text
  end

  def parse_url
    document.css('[property="og:url"]').first.attributes['content'].value
  end

  def brandeins_url
    BrandEins::Config['base_uri']
  end

  def fetcher
    @fetcher ||= BrandEins::Utils::Fetcher.instance
  end

  # Downloads all article PDFs into +path+.
  #
  # @param path [String, Pathname] target directory (created if missing)
  # @return [Array<Pathname>] paths of the written files
  def save_articles_to(path)
    article_pdf_urls.map do |pdf_url|
      pdf = fetcher.fetch(pdf_url)
      file_path = file_path_for_pdf(path, pdf_url)
      File.binwrite(file_path, pdf)
      file_path
    end
  end

  # Creates +path+ if necessary and returns the target file path for
  # +pdf_url+ inside it.
  def file_path_for_pdf(path, pdf_url)
    target_path = Pathname.new(path)
    target_path.mkpath
    target_path + file_name_for_pdf_url(pdf_url)
  end

  # File name part of the URL's path, without any query string.
  def file_name_for_pdf_url(pdf_url)
    File.basename(URI(pdf_url).path)
  end

  def cli
    @cli ||= BrandEins::Utils::CliOutput.instance
  end
end
|
148
|
+
end
|
149
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'optparse'
require 'ostruct'
|
2
|
+
|
3
|
+
module BrandEins
|
4
|
+
# Parses the command-line switches of the +brandeins+ executable.
class CliOptionParser
  # Reads the known switches from +args+ and returns them as a struct.
  #
  # @param args [Array<String>] command-line arguments; recognized switches
  #   are removed from this array, everything else stays in place
  # @return [OpenStruct] responds to +month+, +year+, +path+, +help+,
  #   +verbose+ and +version+; switches that were not given are nil
  # @raise [OptionParser::ParseError] on unknown switches or arguments that
  #   cannot be converted
  def self.parse(args = ARGV)
    options = OpenStruct.new
    opt_parser = OptionParser.new do |opts|
      opts.banner = 'Usage: brandeins download --month n --year n'
      opts.separator ''

      opts.on('-m MONTH', '--month MONTH', Integer, "The publication month of the magazine. E.g. for may: '5'") do |month|
        options.month = month
      end

      opts.on('-y YEAR', '--year YEAR', Integer, "The publication year of the magazine. E.g. the current year '#{Time.now.year}'") do |year|
        options.year = year
      end

      opts.on('--path [PATH]', 'The path where to download the magazine to. Default is the current directory.') do |path|
        options.path = path
      end

      opts.on('-h', '--help', 'Show this message') do |help|
        options.help = help
      end

      opts.on('-v', '--verbose', 'Be verbose') do |verbose|
        options.verbose = verbose
      end

      opts.on('--version', 'Show the version') do |version|
        options.version = version
      end
    end

    opt_parser.parse!(args)
    options
  end
end
|
40
|
+
end
|