brandeins 0.1.6 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,112 @@
1
# encoding: utf-8
require 'fileutils'
require 'net/http'
require 'prawn'
require 'brandeins/parser/article_site'
require 'brandeins/parser/magazine_site'
require 'brandeins/parser/archive_site'
require 'brandeins/merger/pdf_tools'
8
+
9
module BrandEins
  # Downloads the article PDFs of brand eins magazine issues and merges them,
  # together with a generated cover page, into one PDF per issue.
  class Downloader

    # path - directory that receives the merged PDFs; a 'brand-eins-tmp'
    #        working directory is created inside of it.
    # opts - :verbose => true switches on progress output by setting the
    #        global $BE_VERBOSE.
    def initialize(path, opts = {})
      $BE_VERBOSE = !!opts[:verbose]

      @url     = 'http://www.brandeins.de'
      @dl_dir  = path
      @tmp_dir = "#{@dl_dir}/brand-eins-tmp"
      @pdftool = BrandEins::Merger::PDFTools.get_pdf_tool
      @archive = BrandEins::Parser::ArchiveSite.new(@url)
      create_tmp_dirs
    end

    # Fetches every issue the archive lists for +year+. Volumes are numbered
    # by their position in the archive listing, starting at 1.
    def get_magazines_of_year(year = 2000)
      puts "Getting all brand eins magazines of #{year}. This may take a while..." if $BE_VERBOSE
      @archive.get_magazine_links_by_year(year).each_with_index do |magazine_link, index|
        volume = index + 1
        puts "Parsing Volume #{volume} of #{year}" if $BE_VERBOSE
        get_magazine_by_link(magazine_link, pdf_filename(year, volume), year, volume)
      end
    end

    # Fetches a single issue.
    #
    # Raises ArgumentError when the archive lists no such volume for +year+.
    def get_magazine(year = 2000, volume = 1)
      puts "Parsing Volume #{volume} of #{year}" if $BE_VERBOSE
      target_pdf     = pdf_filename(year, volume)
      magazine_links = @archive.get_magazine_links_by_year(year)
      # A volume below 1 would index from the end of the list and silently
      # pick the wrong issue, so it is treated like a missing volume.
      magazine_link = magazine_links[volume - 1] if volume >= 1
      raise ArgumentError, "No magazine found for volume #{volume} of #{year}" if magazine_link.nil?

      get_magazine_by_link(magazine_link, target_pdf, year, volume)
    end

    private

    # Makes sure the working directory for the single PDF files exists.
    def create_tmp_dirs
      FileUtils.mkdir_p @tmp_dir unless File.directory?(@tmp_dir)
    end

    # Downloads the cover image of the issue and wraps it, below the issue
    # title, into a one page PDF. Returns the path of that PDF.
    def create_cover_pdf(year, volume)
      puts "Creating cover for Volume #{volume} of #{year}" if $BE_VERBOSE
      cover          = @archive.get_magazine_cover(year, volume)
      cover_title    = cover[:title]
      cover_img_url  = cover[:img_url]
      cover_img_file = @tmp_dir + "/cover-#{year}-#{volume}.jpg"
      cover_pdf_file = @tmp_dir + "/cover-#{year}-#{volume}.pdf"

      puts "Downloading cover image from #{cover_img_url} to #{cover_img_file}" if $BE_VERBOSE
      IO.binwrite(cover_img_file, Net::HTTP.get(URI(cover_img_url)))

      puts "Creating cover pdf #{cover_pdf_file} from #{cover_img_file}" if $BE_VERBOSE
      Prawn::Document.generate(cover_pdf_file) do |pdf|
        pdf.text "<font size='18'><b>" + cover_title + "</b></font>", :align => :center, :inline_format => true
        pdf.image cover_img_file, :position => :center, :vposition => :center
      end

      cover_pdf_file
    end

    # Downloads all article PDFs behind +target_magazine_link+, prepends the
    # cover and merges everything into +target_pdf+ inside the download
    # directory.
    def get_magazine_by_link(target_magazine_link, target_pdf, year, volume)
      # cleanup removes the working directory after every successful merge,
      # so it has to be recreated before the next issue is downloaded.
      create_tmp_dirs

      pdf_links = @archive.magazine_pdf_links(target_magazine_link)
      pdf_files = download(pdf_links)
      pdf_files.unshift(create_cover_pdf(year, volume))

      if @pdftool.nil?
        puts 'brandeins wont merge the single pdf files since it didnt find an appropriate pdf tool' if $BE_VERBOSE
        return
      end

      target_pdf_path = "#{@dl_dir}/#{target_pdf}"
      if @pdftool.merge_pdf_files(pdf_files, target_pdf_path)
        cleanup
      elsif $BE_VERBOSE
        # Keep the downloads so a failed merge does not throw them away.
        puts "Merging #{target_pdf_path} failed, keeping the single pdf files in #{@tmp_dir}"
      end
    end

    def pdf_filename(year, volume)
      "Brand-Eins-#{year}-#{volume}.pdf"
    end

    # Downloads every link into the working directory and returns the local
    # file names in the order of +pdf_links+.
    def download(pdf_links)
      pdf_links.map do |pdf_link|
        pdf_filename = @tmp_dir + '/' + File.basename(pdf_link)
        download_pdf(pdf_link, pdf_filename)
        pdf_filename
      end
    end

    # Stores the document behind +pdf_url+ as +pdf_filename+ unless a file
    # of that name is already present.
    def download_pdf(pdf_url, pdf_filename)
      if File.exist? pdf_filename
        puts "File #{pdf_filename} seems to be already downloaded" if $BE_VERBOSE
        return true
      end

      puts "Downloading PDF from #{pdf_url} to #{pdf_filename}" if $BE_VERBOSE
      IO.binwrite(pdf_filename, Net::HTTP.get(URI(pdf_url)))
    end

    # Removes the working directory including all downloaded files.
    def cleanup
      FileUtils.rm_r @tmp_dir
    end

  end

end
@@ -0,0 +1,9 @@
1
+ # encoding: utf-8
2
+
3
class Class

  # Lists every class currently known to the interpreter that inherits,
  # directly or through intermediate classes, from the receiver. The
  # receiver itself is not part of the result.
  def subclasses
    descendants = []
    ObjectSpace.each_object(Class) do |candidate|
      descendants << candidate if candidate < self
    end
    descendants
  end

end
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+ require 'brandeins/merger/extensions'
3
+ require 'brandeins/merger/templates/base'
4
+ require 'brandeins/merger/templates/osx'
5
+ require 'brandeins/merger/templates/pdftk_osx'
6
+ require 'brandeins/merger/templates/windows'
7
+ require 'brandeins/merger/templates/ghostscript_windows'
8
+
9
module BrandEins
  module Merger
    # Chooses the PDF merge tool template that fits the current platform.
    class PDFTools

      class << self
        # Returns an instance of a merge tool template for the platform, or
        # nil when the platform is not supported or no template is
        # registered for it.
        #
        # env - optional Hash; :os overrides RUBY_PLATFORM. The hash that is
        #       passed in is not modified.
        def get_pdf_tool(env = nil)
          @env = (env || {}).dup
          @env[:os] ||= RUBY_PLATFORM

          init_pdf_tools
          return_pdf_tool
        end

        private

        # Collects the tool templates registered for the platform; the list
        # is empty for platforms without a template group.
        def init_pdf_tools
          template_group = choose_template_group
          @pdf_tools = template_group ? get_subclasses(template_group) : []
        end

        # Maps the platform string to its template base class. Returns nil
        # for every platform that is neither Windows nor OS X.
        def choose_template_group
          os = @env[:os].to_s
          # 'darwin' contains 'win', hence the explicit Windows markers.
          return BrandEins::Merger::Templates::Windows if os =~ /w32|mswin|mingw/
          return BrandEins::Merger::Templates::OSX if os.include?('darwin')
          nil
        end

        # Instantiates every subclass of +klass+, ordered by class name so
        # the chosen tool does not depend on object space iteration order.
        def get_subclasses(klass)
          klass.subclasses.sort_by(&:to_s).map(&:new)
        end

        def return_pdf_tool
          @pdf_tools.first
        end

      end

    end
  end
end
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+
3
module BrandEins
  module Merger
    module Templates

      # Describes a command line tool that merges PDF files. Subclasses
      # provide the executable (cmd), the argument template (args) and the
      # arguments of a harmless probe call (noop).
      class Base
        # Element of +args+ that is replaced by the list of input files.
        PDF_FILES_PLACEHOLDER  = '__pdf_files__'.freeze
        # Marker inside +args+ that is replaced by the output file.
        TARGET_PDF_PLACEHOLDER = '__target_pdf__'.freeze

        # True when the tool's executable could be started.
        def available?
          _cmd_available? cmd, noop
        end

        def cmd;  raise "Must be implemented by the subclasses"; end
        def noop; raise "Must be implemented by the subclasses"; end
        def args; raise "Must be implemented by the subclasses"; end

        # Merges +pdf_files+ (in the given order) into +target_pdf+ and
        # waits for the tool to finish.
        #
        # Returns true when the tool ran and exited successfully, false
        # when it could not be started, failed, or an error was raised.
        def merge_pdf_files(pdf_files, target_pdf)
          argv = _build_args(pdf_files, target_pdf)
          puts "executing: #{cmd} #{argv.join(' ')}"
          _exec(cmd, *argv) ? true : false
        rescue StandardError => e
          puts "error: #{e.inspect}"
          false
        end

        private

        # Expands the placeholders of the argument template into one
        # argument per file. Block-form gsub keeps backslashes in paths
        # from being read as backreferences.
        def _build_args(pdf_files, target_pdf)
          args.flat_map do |arg|
            next pdf_files.map(&:to_s) if arg == PDF_FILES_PLACEHOLDER

            [arg.to_s
                .gsub(PDF_FILES_PLACEHOLDER) { pdf_files.join(' ') }
                .gsub(TARGET_PDF_PLACEHOLDER) { target_pdf.to_s }]
          end
        end

        # Runs the command and blocks until it has finished. With extra
        # arguments no shell is involved, so file names need no quoting.
        # Returns true on exit status 0, false on failure and nil when the
        # command could not be executed.
        def _exec(cmd, *argv)
          system(cmd, *argv)
        end

        # Probes whether +cmd+ can be executed; +args+ may be a String or
        # an Array of arguments. The probe's output is discarded.
        def _cmd_available?(cmd, args)
          argv = Array(args).map(&:to_s)
          started = system(cmd, *argv, :in => File::NULL, :out => File::NULL, :err => File::NULL)
          # system answers nil only when the command could not be run.
          !started.nil?
        rescue StandardError
          false
        end
      end

    end
  end
end
@@ -0,0 +1,25 @@
1
+ # encoding: utf-8
2
+
3
module BrandEins
  module Merger
    module Templates

      # Merge tool template for the Ghostscript console executable on
      # Windows.
      class GhostscriptWin < BrandEins::Merger::Templates::Windows

        # Name of the executable.
        def cmd
          'gswin64c.exe'
        end

        # Argument template; the placeholders are filled in when the PDF
        # files are merged.
        def args
          %w[-dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile=__target_pdf__ __pdf_files__]
        end

        # Arguments used to probe whether the executable is present.
        def noop
          %w[--version]
        end

      end

    end
  end
end
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+
3
module BrandEins
  module Merger
    module Templates

      # Common parent of all merge tool templates for OS X; it adds no
      # behaviour of its own.
      class OSX < BrandEins::Merger::Templates::Base
      end

    end
  end
end
@@ -0,0 +1,25 @@
1
+ # encoding: utf-8
2
+
3
module BrandEins
  module Merger
    module Templates

      # Merge tool template for pdftk on OS X.
      class PdftkOSX < BrandEins::Merger::Templates::OSX

        # Name of the executable.
        def cmd
          'pdftk'
        end

        # Argument template; the placeholders are filled in when the PDF
        # files are merged.
        def args
          %w[__pdf_files__ output __target_pdf__]
        end

        # Arguments used to probe whether the executable is present.
        def noop
          %w[--version]
        end

      end

    end
  end
end
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+
3
module BrandEins
  module Merger
    module Templates

      # Common parent of all merge tool templates for Windows; it adds no
      # behaviour of its own.
      class Windows < BrandEins::Merger::Templates::Base
      end

    end
  end
end
@@ -0,0 +1,54 @@
1
+ # encoding: utf-8
2
+ require 'nokogiri'
3
+ require 'net/http'
4
+
5
module BrandEins
  module Parser
    # Reads the magazine archive page below +base_url+ ("/archiv.html").
    class ArchiveSite

      # base_url - root URL of the site, without trailing slash.
      # opts     - :html => String uses the given markup instead of
      #            downloading the archive page.
      def initialize(base_url, opts = {})
        @base_url    = base_url
        @archive_url = @base_url + "/archiv.html"
        @doc = Nokogiri::HTML(opts[:html]) if opts[:html]
      end

      # Parsed archive page; downloaded on first use unless markup was
      # handed to the constructor.
      def doc
        @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@archive_url)))
      end

      # Returns the absolute URLs of the magazines listed for +year+, in
      # page order. List items that carry an id, have no link, or whose
      # link has no href are skipped.
      def get_magazine_links_by_year(year = 2000)
        puts "Loading Magazine from year #{year}" if $BE_VERBOSE
        doc.css(".jahrgang-#{year} ul li").each_with_object([]) do |node, links|
          next unless node['id'].nil?

          anchor = node.css('a')[0]
          next if anchor.nil?

          href = anchor['href']
          # Joining a missing href would raise a TypeError.
          next if href.nil?

          links << absolute_url(href)
        end
      end

      # Returns { :title => String, :img_url => String } for the cover of
      # the given issue. When several cover images match, the last one is
      # used; without any image :img_url is the base URL followed by '/'.
      def get_magazine_cover(year, volume)
        title   = doc.css("#month_detail_#{year}_#{volume} .titel").children[0].to_s
        img_url = doc.css("#month_detail_#{year}_#{volume} .cover a img").map { |node| node['src'] }.last
        { :title => title, :img_url => absolute_url(img_url) }
      end

      # Returns the PDF links of all articles of the magazine at +url+.
      def magazine_pdf_links(url)
        magazine = BrandEins::Parser::MagazineSite.new(url, @base_url)
        magazine.get_magazine_pdf_links
      end

      private

      # Resolves +href+ against the base URL: absolute URLs are returned
      # unchanged, leading slashes are dropped so no double slash appears.
      def absolute_url(href)
        href = href.to_s
        return href if href =~ %r{\A[a-z][a-z0-9+.\-]*://}i

        "#{@base_url}/#{href.sub(%r{\A/+}, '')}"
      end

    end

  end

end
@@ -0,0 +1,26 @@
1
+ # encoding: utf-8
2
+ require 'nokogiri'
3
+ require 'net/http'
4
+
5
module BrandEins
  module Parser
    # Reads a single article page and looks up its PDF download link.
    class ArticleSite

      # url - address of the article page.
      def initialize(url)
        @url = url
      end

      # Parsed article page; downloaded on first use.
      def doc
        @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@url)))
      end

      # Returns the href of the first download link in the sidebar, or nil
      # when the page has no such link.
      def get_pdf_link
        puts "Parsing Article: #{@url}" if $BE_VERBOSE
        download_anchor = doc.css("div#sidebar ul li#downloaden a")[0]
        download_anchor ? download_anchor['href'] : nil
      end

    end

  end
end
@@ -0,0 +1,49 @@
1
+ # encoding: utf-8
2
+ require 'nokogiri'
3
+ require 'net/http'
4
+
5
module BrandEins
  module Parser
    # Reads the page of one magazine issue and collects the PDF links of
    # its articles.
    class MagazineSite

      # url      - address of the magazine page.
      # base_url - root URL of the site, without trailing slash; relative
      #            links are resolved against it.
      def initialize(url, base_url)
        @url, @base_url = url, base_url
      end

      # Parsed magazine page; downloaded on first use.
      def doc
        @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@url)))
      end

      # Returns the PDF links of the editorial articles followed by those
      # of the main topic articles.
      def get_magazine_pdf_links
        puts "Parsing #{@url}" if $BE_VERBOSE
        get_editorial_article_links + get_schwerpunkt_article_links
      end

      def get_schwerpunkt_article_links
        get_links("div.articleList ul h4 a")
      end

      def get_editorial_article_links
        get_links(".editorial-links li a")
      end

      # Visits every article linked by the anchors matching +css_selector+
      # and returns the absolute URLs of their PDFs. Anchors without href
      # and articles without a PDF are left out.
      def get_links(css_selector)
        doc.css(css_selector).each_with_object([]) do |node, links|
          href = node['href']
          next if href.nil?

          article_link = absolute_url(href)
          pdf_link = BrandEins::Parser::ArticleSite.new(article_link).get_pdf_link

          if pdf_link.nil?
            # Skip the article whether or not verbose output is enabled;
            # joining nil into a URL would raise a TypeError.
            if $BE_VERBOSE
              puts "-------------------------------"
              puts "No Content for: #{article_link}"
              puts "-------------------------------"
            end
            next
          end

          links << absolute_url(pdf_link)
        end
      end

      private

      # Resolves +href+ against the base URL: absolute URLs are returned
      # unchanged, leading slashes are dropped so no double slash appears.
      def absolute_url(href)
        href = href.to_s
        return href if href =~ %r{\A[a-z][a-z0-9+.\-]*://}i

        "#{@base_url}/#{href.sub(%r{\A/+}, '')}"
      end
    end

  end

end
@@ -1,13 +1,11 @@
1
- require File.expand_path '../pdf-tools', __FILE__
1
+ # encoding: utf-8
2
2
 
3
3
  module BrandEins
4
4
  class Setup
5
- attr_reader :pdf_tool
6
5
 
7
- def initialize(env = nil)
8
- env = Hash.new if env.nil?
6
+ def initialize(env = {})
9
7
  @os = env[:os] || RUBY_PLATFORM
10
- @pdf_tool = env[:pdf_tool] || BrandEins::PdfTools.get_pdf_tool(env)
8
+ @pdf_tool = env[:pdf_tool] || BrandEins::Merging::PdfTools.get_pdf_tool(env)
11
9
  end
12
10
 
13
11
  def run
@@ -37,4 +35,4 @@ module BrandEins
37
35
  end
38
36
 
39
37
  end
40
- end
38
+ end