brandeins 0.2.2 → 0.3.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +5 -1
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +5 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +11 -0
  7. data/Gemfile +7 -4
  8. data/Gemfile.lock +47 -21
  9. data/NOTES.md +6 -0
  10. data/Rakefile +15 -8
  11. data/bin/brandeins +3 -1
  12. data/brandeins.gemspec +0 -1
  13. data/lib/brandeins.rb +3 -5
  14. data/lib/brandeins/cli.rb +46 -34
  15. data/lib/brandeins/config.rb +18 -0
  16. data/lib/brandeins/kiosk.rb +100 -0
  17. data/lib/brandeins/merger/external/base.rb +16 -6
  18. data/lib/brandeins/merger/pdf_tools.rb +3 -6
  19. data/lib/brandeins/pages/archive.rb +91 -0
  20. data/lib/brandeins/pages/article.rb +37 -0
  21. data/lib/brandeins/pages/cover.rb +67 -0
  22. data/lib/brandeins/pages/magazine.rb +149 -0
  23. data/lib/brandeins/utils/cli_option_parser.rb +40 -0
  24. data/lib/brandeins/utils/cli_output.rb +100 -0
  25. data/lib/brandeins/utils/fetcher.rb +115 -0
  26. data/lib/brandeins/utils/merger.rb +41 -0
  27. data/lib/brandeins/version.rb +1 -1
  28. data/rubocop-todo.yml +141 -0
  29. data/spec/lib/brandeins/kiosk_spec.rb +66 -0
  30. data/spec/lib/brandeins/pages/archive_spec.rb +40 -0
  31. data/spec/lib/brandeins/pages/article_spec.rb +23 -0
  32. data/spec/lib/brandeins/pages/magazine_spec.rb +91 -0
  33. data/spec/lib/brandeins/utils/fetcher_spec.rb +8 -0
  34. data/spec/lib/brandeins_spec.rb +19 -0
  35. data/spec/spec_helper.rb +23 -0
  36. data/spec/support/capture_stdout.rb +12 -0
  37. data/spec/support/fixtures/archive.html +2365 -0
  38. data/spec/support/fixtures/artikel-masskonfektion-aus-plastik.html +254 -0
  39. data/spec/support/fixtures/artikel-schauspieler-daenemark.html +247 -0
  40. data/{test_support → spec/support}/fixtures/cover.jpg +0 -0
  41. data/spec/support/fixtures/editorial.html +236 -0
  42. data/spec/support/fixtures/just-a.pdf +0 -0
  43. data/spec/support/fixtures/magazine-1-2013.html +242 -0
  44. data/spec/support/fixtures/magazine-cover-fallback.html +1610 -0
  45. data/spec/support/fixtures/magazine-with-cover.html +1416 -0
  46. metadata +68 -61
  47. data/.rvmrc +0 -48
  48. data/lib/brandeins/downloader.rb +0 -111
  49. data/lib/brandeins/errors.rb +0 -5
  50. data/lib/brandeins/parser/archive_site.rb +0 -54
  51. data/lib/brandeins/parser/article_site.rb +0 -26
  52. data/lib/brandeins/parser/magazine_site.rb +0 -49
  53. data/lib/brandeins/setup.rb +0 -38
  54. data/specs/brandeins_spec.rb +0 -52
  55. data/specs/spec_helper.rb +0 -1
  56. data/test/brandeins_test.rb +0 -65
  57. data/test/helper.rb +0 -1
  58. data/test_support/capture_stdout.rb +0 -12
  59. data/test_support/fixtures/brandeins_archiv.html +0 -50
@@ -1,5 +1,9 @@
1
1
  # encoding: utf-8
2
2
 
3
+ require 'shellwords'
4
+
5
+ require_relative '../../utils/cli_output'
6
+
3
7
  module BrandEins
4
8
  module Merger
5
9
  module External
@@ -15,19 +19,25 @@ module BrandEins
15
19
  def args; raise "Must be implemtented by the subclasses"; end
16
20
 
17
21
  def merge_pdf_files(pdf_files, target_pdf)
22
+ # TODO: This is terrible. Use shellwords.shellescape!
18
23
  begin
19
- pdf_files_arg = pdf_files.map {|pdf_file| "'#{pdf_file}'" }.join ' '
20
- args = self.args.join(' ').gsub(/__pdf_files__/, pdf_files_arg).gsub(/__target_pdf__/, target_pdf)
21
- puts "executing: #{cmd} #{args}"
22
- _exec("#{cmd} #{args}")
24
+ pdf_files_arg = pdf_files.map {|pdf_file| "'#{pdf_file.to_s}'" }.join ' '
25
+ args = self.args.join(' ').gsub(/__pdf_files__/, pdf_files_arg).gsub(/__target_pdf__/, "'#{target_pdf.to_s}'")
26
+ cli.info "Running PDF Merger for #{target_pdf}"
27
+ cli.debug "Executing: `#{cmd} #{args}`" do
28
+ _exec("#{cmd} #{args}")
29
+ end
23
30
  rescue Exception => e
24
- puts "error: #{e.inspect}"
31
+ cli.error "Error when merging file: #{e.inspect}"
25
32
  return false
26
33
  end
27
34
  return true
28
35
  end
29
36
 
30
- private
37
# Memoized accessor for the shared CLI output object
# (BrandEins::Utils::CliOutput.instance) used for info, debug and error
# messages in this class.
def cli
  return @cli if @cli

  @cli = BrandEins::Utils::CliOutput.instance
end
40
+
31
41
  def _exec (cmd)
32
42
  IO.popen(cmd) do |io|
33
43
  io.each do |line|
@@ -1,7 +1,7 @@
1
1
  # encoding: utf-8
2
- require 'brandeins/merger/external/base'
3
- require 'brandeins/merger/external/pdftk'
4
- require 'brandeins/merger/external/ghostscript_windows'
2
+ require_relative 'external/base'
3
+ require_relative 'external/pdftk'
4
+ require_relative 'external/ghostscript_windows'
5
5
 
6
6
  module BrandEins
7
7
  module Merger
@@ -14,8 +14,6 @@ module BrandEins
14
14
  get_klass_for_external(env).new
15
15
  end
16
16
 
17
- private
18
-
19
17
  def get_klass_for_external(env)
20
18
  if env[:os].include? 'w32'
21
19
  BrandEins::Merger::External::GhostscriptWindows
@@ -23,7 +21,6 @@ module BrandEins
23
21
  BrandEins::Merger::External::Pdftk
24
22
  end
25
23
  end
26
-
27
24
  end
28
25
 
29
26
  end
@@ -0,0 +1,91 @@
1
+ # encoding: utf-8
2
+
3
+ require 'nokogiri'
4
+ require 'english'
5
+
6
+ require_relative '../config'
7
+ require_relative '../pages/magazine'
8
+
9
+ module BrandEins
10
+ module Pages
11
# Wraps the archive page and gives access to the magazines listed on it.
#
# Usage of +Archive+
#
#   page = Archive.new(html: html)
#   page.magazines_for_year(2000)
#   # => { 1 => Magazine, 2 => Magazine, ... }  (keyed by month)
#
#   page.magazine_for(month: 1, year: 2000)
#   # => Magazine
#
#   page.magazine_for(month: 13, year: 9999)
#   # => nil
#
class Archive
  # Two-digit month directly in front of a slash in a figure's meta text.
  MONTH_PATTERN = /(?:.+)(\d{2})\/(?:.+)/

  # opts - Hash; :html may carry the archive markup. When it is absent the
  #        page is fetched on first access. The hash is not modified.
  def initialize(opts = {})
    @html = opts[:html]
    @magazines = {}
  end

  # Returns the archive markup, fetching it from the configured archive URI
  # when none was given to the constructor.
  #
  # Raises BrandEins::Utils::Fetcher::ContentNotFetchedError (with an
  # extended message) when the download fails.
  def html
    cli.info "Loading the archive" do
      @html ||= fetcher.fetch(archive_url)
    end
    # Return the markup itself instead of relying on what cli.info returns
    # (its return value is not visible from this file).
    @html
  rescue BrandEins::Utils::Fetcher::ContentNotFetchedError => e
    raise e, "Could not download the archiv.html (May be the URL changed?)\n=> Original error: #{e.message}", e.backtrace
  end

  # Returns a Hash of month => Magazine for +year+; the result is cached per
  # year. Empty when the archive has no section for that year.
  def magazines_for_year(year)
    @magazines[year] ||= parse_magazines_for_year(year)
  end

  # Returns the Magazine for the given month and year, or nil.
  def magazine_for(month: nil, year: nil)
    magazines_for_year(year)[month]
  end

  private

  def document
    @document ||= Nokogiri::HTML(html)
  end

  # Collects the magazines below the "h3#anchor-<year>" heading.
  def parse_magazines_for_year(year)
    anchor = document.css("h3#anchor-#{year}").first
    # Unknown year: no heading, hence no magazines (instead of failing on nil).
    return {} unless anchor

    year_section = anchor.xpath('../../..')
    year_section.css('article figure').each_with_object({}) do |figure, magazines|
      magazine_url = extract_magazine_url(figure)
      # A figure without a "read more" link cannot be turned into a magazine.
      next unless magazine_url

      magazines[extract_magazine_month(figure)] = BrandEins::Pages::Magazine.new(url: magazine_url)
    end
  end

  # Absolute URL of the magazine page linked from +figure+, or nil when the
  # figure has no usable link.
  def extract_magazine_url(figure)
    link = figure.css('a.read.more').first
    href = link && link['href']
    href && brandeins_url + '/' + href
  end

  # Month number taken from the figure's ".meta" text, or nil when it cannot
  # be determined.
  def extract_magazine_month(figure)
    meta = figure.css('.meta').first
    match = meta && MONTH_PATTERN.match(meta.text)
    match && match[1].to_i
  end

  def brandeins_url
    BrandEins::Config['base_uri']
  end

  def archive_url
    BrandEins::Config['archive_uri']
  end

  def fetcher
    @fetcher ||= BrandEins::Utils::Fetcher.instance
  end

  def cli
    @cli ||= BrandEins::Utils::CliOutput.instance
  end
end
90
+ end
91
+ end
@@ -0,0 +1,37 @@
1
+ # encoding: utf-8
2
+
3
+ require 'nokogiri'
4
+
5
+ require_relative '../config'
6
+
7
+ module BrandEins
8
+ module Pages
9
# Wraps the markup of a single article page and extracts the link to the
# article's PDF and the article title from it.
class Article
  # html - markup of the article page.
  def initialize(html)
    @html = html
  end

  # Returns the absolute URL of the first link whose href ends in "pdf",
  # or nil when the page has no such link.
  def pdf_url
    link = document.css('a[href$=pdf]').first
    link && brandeins_url + '/' + link['href']
  end

  # Returns the text of the first child of the "h2.csc-firstHeader" heading
  # with newlines removed, or nil when the heading is missing or empty.
  def title
    heading = document.css('h2.csc-firstHeader').first
    first_child = heading && heading.children.first
    first_child && first_child.text.delete("\n")
  end

  # Parsed (and memoized) Nokogiri document of the article markup.
  def document
    @document ||= Nokogiri::HTML(@html)
  end

  # Base URI that relative links are resolved against.
  def brandeins_url
    BrandEins::Config['base_uri']
  end
end
36
+ end
37
+ end
@@ -0,0 +1,67 @@
1
+ # encoding: utf-8
2
+
3
+ require 'tempfile'
4
+ require 'prawn'
5
+
6
+ require_relative '../utils/fetcher'
7
+
8
+ module BrandEins
9
+ module Pages
10
# Builds a cover PDF for a magazine and stores it on disk.
class Cover
  # magazine - object providing cover_url, title, month and year.
  def initialize(magazine)
    @magazine = magazine
  end

  def cover_image_url
    @magazine.cover_url
  end

  def cover_title
    @magazine.title
  end

  # Downloads the cover image and returns the rendered cover PDF.
  def to_pdf
    create_cover_pdf(download_cover_image)
  end

  def download_cover_image
    fetcher.fetch(cover_image_url)
  end

  # Renders a PDF that shows the magazine title as a centered headline.
  #
  # image - the downloaded cover image; currently not embedded (see TODO).
  def create_cover_pdf(image)
    Prawn::Document.new do |pdf|
      pdf.text %(<font size="18"><b>#{cover_title}</b></font>),
               align: :center,
               inline_format: true
      # TODO: get Null Byte?
      # pdf.image image, position: :center, vposition: :center
    end.render
  end

  # Writes the cover PDF into the directory +path+ unless the file already
  # exists.
  #
  # Returns the path of the cover file, or nil when the cover could not be
  # fetched.
  def save_to(path)
    cover_file_path = cover_file_path_for_path(path)
    # File.exist? - the File.exists? alias is gone in current Ruby versions.
    return cover_file_path if File.exist?(cover_file_path)

    File.binwrite(cover_file_path, to_pdf)
    cover_file_path
  rescue BrandEins::Utils::Fetcher::ContentNotFetchedError
    # Best effort: without a downloadable cover no cover file is written.
    nil
  end

  def cover_file_path_for_path(path)
    Pathname.new(path) + file_name
  end

  def file_name
    "magazine-cover-#{@magazine.month}-#{@magazine.year}.pdf"
  end

  def fetcher
    @fetcher ||= BrandEins::Utils::Fetcher.instance
  end
end
66
+ end
67
+ end
@@ -0,0 +1,149 @@
1
+ # encoding: utf-8
2
+
3
+ require 'english'
4
+ require 'nokogiri'
5
+
6
+ require_relative '../config'
7
+ require_relative '../utils/fetcher'
8
+ require_relative '../utils/cli_output'
9
+ require_relative '../pages/article'
10
+
11
+ module BrandEins
12
+ module Pages
13
# Represents the page of one magazine issue: extracts article links, cover
# image, title and issue date from its markup and downloads article PDFs.
#
# Usage of +Magazine+
#
#   page = BrandEins::Pages::Magazine.new(html)        # markup given
#   page = BrandEins::Pages::Magazine.new(url: url)    # markup fetched lazily
#   page.article_pdf_urls
#   # => ['http://example.com/archive/article1.pdf',
#   #     'http://example.com/archive/article2.pdf',
#   #     ...
#   #    ]
#
#   page.cover_url
#   # => 'http://example.com/archive/cover1.png'
#
class Magazine
  # Both patterns read the "Ausgabe <month>/<year>" issue text.
  YEAR_PATTERN = /Ausgabe (?:.+)\/(.+)/
  MONTH_PATTERN = /Ausgabe (.+)\/(?:.+)/

  # opts - either the page markup as a String, or a Hash with :html and/or
  #        :url.
  def initialize(opts = {})
    opts = { html: opts } if opts.is_a?(String)
    @html = opts[:html]
    @url = opts[:url]
  end

  # Returns the page markup, fetching it from the URL when it was not given.
  #
  # Raises ArgumentError when neither markup nor URL is known.
  def html
    @html ||= begin
      # Without this guard +url+ would try to parse the URL out of the
      # (missing) markup, which calls +html+ again and never terminates.
      raise ArgumentError, 'Magazine needs either :html or :url' unless @url

      fetcher.fetch(@url)
    end
  end

  def article_urls
    @article_urls ||= parse_article_urls
  end

  # Fetches every article page and returns the PDF URLs of the articles that
  # have one. Articles without a PDF are reported through +cli+ and skipped.
  def article_pdf_urls
    @article_pdf_urls ||= article_urls.map do |article_url|
      article = BrandEins::Pages::Article.new(fetcher.fetch(article_url))
      pdf_url = article.pdf_url
      cli.info "No PDF for: \"#{article.title}\"" unless pdf_url
      # Yield the URL (or nil), never the logger's return value.
      pdf_url
    end.compact
  end

  def cover_url
    @cover_url ||= parse_cover_image_url
  end

  # Text of the first child below ".current-issue h2", or nil if absent.
  def title
    @title ||= begin
      node = document.css('.current-issue h2').children.first
      node && node.text
    end
  end

  def year
    @year ||= parse_year
  end

  def month
    @month ||= parse_month
  end

  def url
    @url ||= parse_url
  end

  def document
    @document ||= Nokogiri::HTML(html)
  end

  # Absolute URLs of all links directly below ".ihv_list".
  def parse_article_urls
    document.css('.ihv_list > a').map do |node|
      brandeins_url + '/' + node['href']
    end
  end

  # Absolute URL of the cover image, preferring ".coverImage" over
  # ".preparedTeaserImage"; nil when neither image is usable.
  def parse_cover_image_url
    img_tag = primary_cover_image || secondary_cover_image
    src = img_tag && img_tag['src']
    src && brandeins_url + '/' + src
  end

  def secondary_cover_image
    document.css('.preparedTeaserImage img').first
  end

  def primary_cover_image
    document.css('.coverImage img').first
  end

  # Year part of the issue text as Integer, or nil when it does not match.
  def parse_year
    match = YEAR_PATTERN.match(issue_text)
    match && match[1].to_i
  end

  # Month part of the issue text as Integer, or nil when it does not match.
  def parse_month
    match = MONTH_PATTERN.match(issue_text)
    match && match[1].to_i
  end

  # Text of the first child of the last ".current-issue h3", or nil.
  def issue_text
    node = document.css('.current-issue h3').last
    first_child = node && node.children.first
    first_child && first_child.text
  end

  # Value of the page's og:url meta property, or nil when it is missing.
  def parse_url
    meta = document.css('[property="og:url"]').first
    meta && meta['content']
  end

  def brandeins_url
    BrandEins::Config['base_uri']
  end

  def fetcher
    @fetcher ||= BrandEins::Utils::Fetcher.instance
  end

  # Downloads all article PDFs into the directory +path+ (created when
  # missing) and returns the list of written file paths.
  def save_articles_to(path)
    article_pdf_urls.map do |pdf_url|
      pdf = fetcher.fetch(pdf_url)
      file_path = file_path_for_pdf(path, pdf_url)
      File.binwrite(file_path, pdf)
      file_path
    end
  end

  # Target path for the PDF at +pdf_url+; makes sure the directory exists.
  def file_path_for_pdf(path, pdf_url)
    target_path = Pathname.new(path)
    target_path.mkpath
    target_path + file_name_for_pdf_url(pdf_url)
  end

  # File name component of the URL's path.
  def file_name_for_pdf_url(pdf_url)
    File.basename(URI(pdf_url).path)
  end

  def cli
    @cli ||= BrandEins::Utils::CliOutput.instance
  end
end
148
+ end
149
+ end
@@ -0,0 +1,40 @@
1
require 'optparse'
require 'ostruct'
2
+
3
+ module BrandEins
4
# Turns the command line arguments of the brandeins executable into an
# options object.
class CliOptionParser
  # Parses +args+ and returns the recognised options.
  #
  # args - list of command line arguments (defaults to ARGV). Parsing uses
  #        OptionParser#parse!, so recognised options are removed from the
  #        list and only the remaining arguments stay in it.
  #
  # Returns an OpenStruct on which month, year, path, help, verbose and
  # version are set for the options that were given; options that were not
  # given read as nil.
  #
  # Invalid or malformed options are rejected by OptionParser, which raises
  # its own error classes (nothing is rescued here).
  def self.parse(args = ARGV)
    options = OpenStruct.new

    opt_parser = OptionParser.new do |opts|
      opts.banner = "Usage: brandeins download --month n --year n"
      opts.separator ""

      opts.on('-m MONTH', '--month month', Integer, "The publication month of the magazine. E.g. for may: '5'") do |month|
        options.month = month
      end

      opts.on('-y YEAR', '--year YEAR', Integer, "The publication year of the magazine. E.g. the current year '#{Time.now.year}'") do |year|
        options.year = year
      end

      opts.on('--path [PATH]', 'The path where to download the magazine to. Default is the current directory.') do |path|
        options.path = path
      end

      opts.on('-h', '--help', 'Show this message') do |help|
        options.help = help
      end

      opts.on('-v', '--verbose', 'Be verbose') do |verbose|
        options.verbose = verbose
      end

      opts.on('--version', 'Show the version') do |version|
        options.version = version
      end
    end

    opt_parser.parse!(args)
    options
  end
end
40
+ end