RubyGems - brandeins - Versions diffs - 0.2.2 → 0.3.0.pre - Mend

brandeins 0.2.2 → 0.3.0.pre

Files changed (59) hide show

checksums.yaml +7 -0
data/.gitignore +5 -1
data/.rspec +2 -0
data/.rubocop.yml +5 -0
data/.ruby-version +1 -0
data/.travis.yml +11 -0
data/Gemfile +7 -4
data/Gemfile.lock +47 -21
data/NOTES.md +6 -0
data/Rakefile +15 -8
data/bin/brandeins +3 -1
data/brandeins.gemspec +0 -1
data/lib/brandeins.rb +3 -5
data/lib/brandeins/cli.rb +46 -34
data/lib/brandeins/config.rb +18 -0
data/lib/brandeins/kiosk.rb +100 -0
data/lib/brandeins/merger/external/base.rb +16 -6
data/lib/brandeins/merger/pdf_tools.rb +3 -6
data/lib/brandeins/pages/archive.rb +91 -0
data/lib/brandeins/pages/article.rb +37 -0
data/lib/brandeins/pages/cover.rb +67 -0
data/lib/brandeins/pages/magazine.rb +149 -0
data/lib/brandeins/utils/cli_option_parser.rb +40 -0
data/lib/brandeins/utils/cli_output.rb +100 -0
data/lib/brandeins/utils/fetcher.rb +115 -0
data/lib/brandeins/utils/merger.rb +41 -0
data/lib/brandeins/version.rb +1 -1
data/rubocop-todo.yml +141 -0
data/spec/lib/brandeins/kiosk_spec.rb +66 -0
data/spec/lib/brandeins/pages/archive_spec.rb +40 -0
data/spec/lib/brandeins/pages/article_spec.rb +23 -0
data/spec/lib/brandeins/pages/magazine_spec.rb +91 -0
data/spec/lib/brandeins/utils/fetcher_spec.rb +8 -0
data/spec/lib/brandeins_spec.rb +19 -0
data/spec/spec_helper.rb +23 -0
data/spec/support/capture_stdout.rb +12 -0
data/spec/support/fixtures/archive.html +2365 -0
data/spec/support/fixtures/artikel-masskonfektion-aus-plastik.html +254 -0
data/spec/support/fixtures/artikel-schauspieler-daenemark.html +247 -0
data/{test_support → spec/support}/fixtures/cover.jpg +0 -0
data/spec/support/fixtures/editorial.html +236 -0
data/spec/support/fixtures/just-a.pdf +0 -0
data/spec/support/fixtures/magazine-1-2013.html +242 -0
data/spec/support/fixtures/magazine-cover-fallback.html +1610 -0
data/spec/support/fixtures/magazine-with-cover.html +1416 -0
metadata +68 -61
data/.rvmrc +0 -48
data/lib/brandeins/downloader.rb +0 -111
data/lib/brandeins/errors.rb +0 -5
data/lib/brandeins/parser/archive_site.rb +0 -54
data/lib/brandeins/parser/article_site.rb +0 -26
data/lib/brandeins/parser/magazine_site.rb +0 -49
data/lib/brandeins/setup.rb +0 -38
data/specs/brandeins_spec.rb +0 -52
data/specs/spec_helper.rb +0 -1
data/test/brandeins_test.rb +0 -65
data/test/helper.rb +0 -1
data/test_support/capture_stdout.rb +0 -12
data/test_support/fixtures/brandeins_archiv.html +0 -50

data/lib/brandeins/merger/external/base.rb CHANGED

@@ -1,5 +1,9 @@
 # encoding: utf-8
+require 'shellwords'
+require_relative '../../utils/cli_output'
 module BrandEins
   module Merger
     module External
@@ -15,19 +19,25 @@ module BrandEins
         # Template hook: concrete mergers override this to return the array of
         # command line arguments (containing the __pdf_files__ and
         # __target_pdf__ placeholders); the base version always raises.
         def args; raise "Must be implemented by the subclasses"; end
         def merge_pdf_files(pdf_files, target_pdf)
+          # TODO: The file names are wrapped in single quotes by hand below, which breaks the command as soon as a path contains a single quote. Use Shellwords.shellescape instead.
           begin
-            pdf_files_arg = pdf_files.map {|pdf_file| "'#{pdf_file}'" }.join ' '
-            args = self.args.join(' ').gsub(/__pdf_files__/, pdf_files_arg).gsub(/__target_pdf__/, target_pdf)
-            puts "executing: #{cmd} #{args}"
-            _exec("#{cmd} #{args}")
+            pdf_files_arg = pdf_files.map {|pdf_file| "'#{pdf_file.to_s}'" }.join ' '
+            args = self.args.join(' ').gsub(/__pdf_files__/, pdf_files_arg).gsub(/__target_pdf__/, "'#{target_pdf.to_s}'")
+            cli.info "Running PDF Merger for #{target_pdf}"
+            cli.debug "Executing: `#{cmd} #{args}`" do
+              _exec("#{cmd} #{args}")
+            end
           rescue Exception => e
-            puts "error: #{e.inspect}"
+            cli.error "Error when merging file: #{e.inspect}"
             return false
           end
           return true
         end
-        private
+        def cli
+          @cli ||= BrandEins::Utils::CliOutput.instance
+        end
         def _exec (cmd)
           IO.popen(cmd) do |io|
             io.each do |line|

data/lib/brandeins/merger/pdf_tools.rb CHANGED

@@ -1,7 +1,7 @@
 # encoding: utf-8
-require 'brandeins/merger/external/base'
-require 'brandeins/merger/external/pdftk'
-require 'brandeins/merger/external/ghostscript_windows'
+require_relative 'external/base'
+require_relative 'external/pdftk'
+require_relative 'external/ghostscript_windows'
 module BrandEins
   module Merger
@@ -14,8 +14,6 @@ module BrandEins
           get_klass_for_external(env).new
         end
-        private
         def get_klass_for_external(env)
           if env[:os].include? 'w32'
             BrandEins::Merger::External::GhostscriptWindows
@@ -23,7 +21,6 @@ module BrandEins
             BrandEins::Merger::External::Pdftk
           end
         end
       end
     end

data/lib/brandeins/pages/archive.rb ADDED

@@ -0,0 +1,91 @@
+# encoding: utf-8
+require 'nokogiri'
+require 'english'
+require_relative '../config'
+require_relative '../pages/magazine'
+module BrandEins
+  module Pages
+    # Wraps the archive page and exposes the magazines listed on it.
+    #
+    # Usage of +Archive+
+    #
+    # page = Archive.new(html: html) # or Archive.new(html)
+    # page.magazines_for_year(2000)
+    # => { 1 => Magazine, 2 => Magazine, ... }
+    #
+    # page.magazine_for(month: 1, year: 2000)
+    # => Magazine
+    #
+    # page.magazine_for(month: 13, year: 9999)
+    # => nil
+    #
+    class Archive
+      # opts - Hash with an optional :html entry holding the archive markup,
+      #        or the markup itself as a String. Without markup the page is
+      #        fetched from the configured archive_uri on first access.
+      def initialize(opts = {})
+        opts = { html: opts } if opts.is_a? String
+        # Read the entry instead of deleting it so the caller's hash stays untouched.
+        @html = opts[:html]
+        @magazines = {}
+      end
+      # Loads the archive markup (fetching it only when none is cached yet)
+      # while announcing the step on the CLI output.
+      # NOTE(review): the return value is whatever cli.info returns for the
+      # block; presumably the block's result - confirm in CliOutput.
+      #
+      # Raises Fetcher::ContentNotFetchedError with an extended message when
+      # the download fails.
+      def html
+        cli.info "Loading the archive" do
+          @html ||= fetcher.fetch(archive_url)
+        end
+      rescue BrandEins::Utils::Fetcher::ContentNotFetchedError => e
+        raise e, "Could not download the archiv.html (May be the URL changed?)\n=> Original error: #{e.message}", e.backtrace
+      end
+      # Returns a Hash mapping month numbers to Magazine objects for +year+
+      # (empty when the page has no section for that year). Memoized per year.
+      def magazines_for_year(year)
+        @magazines[year] ||= parse_magazines_for_year(year)
+      end
+      # Returns the Magazine for the given month and year, or nil if there is none.
+      def magazine_for(month: nil, year: nil)
+        magazines_for_year(year)[month]
+      end
+      private
+      def document
+        @document ||= Nokogiri::HTML(html)
+      end
+      def parse_magazines_for_year(year)
+        anchor = document.css("h3#anchor-#{year}").first
+        # An unknown year has no anchor on the page; answer with "no magazines"
+        # so that magazine_for returns nil instead of raising on nil.
+        return {} unless anchor
+        root = anchor.xpath('../../..')
+        root.css('article figure').each_with_object({}) do |figure, magazines|
+          magazine_url = extract_magazine_url(figure)
+          magazine_month = extract_magazine_month(figure)
+          magazines[magazine_month] = BrandEins::Pages::Magazine.new(url: magazine_url)
+        end
+      end
+      def extract_magazine_url(figure)
+        brandeins_url + '/' + figure.css('a.read.more').first['href']
+      end
+      # Returns the two digits in front of the slash in the figure's .meta
+      # text as an Integer, or nil when the text does not match.
+      def extract_magazine_month(figure)
+        meta = figure.css('.meta').first
+        match = meta.text.match(/(?:.+)(\d{2})\/(?:.+)/)
+        match && match[1].to_i
+      end
+      def brandeins_url
+        BrandEins::Config['base_uri']
+      end
+      def archive_url
+        BrandEins::Config['archive_uri']
+      end
+      def fetcher
+        @fetcher ||= BrandEins::Utils::Fetcher.instance
+      end
+      def cli
+        @cli ||= BrandEins::Utils::CliOutput.instance
+      end
+    end
+  end
+end

data/lib/brandeins/pages/article.rb ADDED

@@ -0,0 +1,37 @@
+# encoding: utf-8
+require 'nokogiri'
+require_relative '../config'
+module BrandEins
+  module Pages
+    class Article
+      def initialize(html)
+        @html = html
+      end
+      def pdf_url
+        if node = document.css('a[href$=pdf]').first
+          brandeins_url + '/' + node['href']
+        end
+      end
+      def title
+        if node = document.css('h2.csc-firstHeader').first
+          node.children.first.text.gsub("\n", '')
+        end
+      end
+      def document
+        @document ||= Nokogiri::HTML(@html)
+      end
+      def brandeins_url
+        BrandEins::Config['base_uri']
+      end
+    end
+  end
+end

data/lib/brandeins/pages/cover.rb ADDED

@@ -0,0 +1,67 @@
+# encoding: utf-8
+require 'tempfile'
+require 'prawn'
+require_relative '../utils/fetcher'
+module BrandEins
+  module Pages
+    class Cover
+      def initialize(magazine)
+        @magazine = magazine
+      end
+      def cover_image_url
+        @magazine.cover_url
+      end
+      def cover_title
+        @magazine.title
+      end
+      def to_pdf
+        cover_image = download_cover_image
+        cover_pdf   = create_cover_pdf(cover_image)
+      end
+      def download_cover_image
+        fetcher.fetch(cover_image_url)
+      end
+      def create_cover_pdf(image)
+        Prawn::Document.new do |pdf|
+          pdf.text '<font size="18"><b>' + cover_title + '</b></font>',
+                   align: :center,
+                   inline_format: true
+          if image
+            # TODO: get Null Byte?
+            # pdf.image image, position: :center, vposition: :center
+          end
+        end.render
+      end
+      def save_to(path)
+        cover_file_path = cover_file_path_for_path(path)
+        return cover_file_path if File.exists? cover_file_path
+        File.binwrite(cover_file_path, to_pdf)
+        cover_file_path
+      rescue BrandEins::Utils::Fetcher::ContentNotFetchedError => e
+      end
+      def cover_file_path_for_path(path)
+        Pathname.new(path) + file_name
+      end
+      def file_name
+        "magazine-cover-#{@magazine.month}-#{@magazine.year}.pdf"
+      end
+      def fetcher
+        @fetcher ||= BrandEins::Utils::Fetcher.instance
+      end
+    end
+  end
+end

data/lib/brandeins/pages/magazine.rb ADDED

@@ -0,0 +1,149 @@
+# encoding: utf-8
+require 'english'
+require 'nokogiri'
+require_relative '../config'
+require_relative '../utils/fetcher'
+require_relative '../utils/cli_output'
+require_relative '../pages/article'
+module BrandEins
+  module Pages
+    # Usage of +MagazinePage+
+    #
+    # page = BrandEins::Parser::MagazinePage.new(html)
+    # page.article_pdf_urls
+    # => ['http://example.com/archive/article1.pdf',
+    #     'http://example.com/archive/article2.pdf',
+    #     ...
+    #    ]
+    #
+    # page.cover_url
+    # => 'http://example.com/archive/cover1.png'
+    #
+    class Magazine
+      def initialize(opts = {})
+        if opts.is_a? String
+          opts = { html: opts }
+        end
+        @html = opts[:html]
+        @url  = opts[:url]
+      end
+      def html
+        @html ||= fetcher.fetch(url)
+      end
+      def article_urls
+        @article_urls ||= parse_article_urls
+      end
+      def article_pdf_urls
+        @article_pdf_urls ||= article_urls.map do |article_url|
+          article_html = fetcher.fetch(article_url)
+          article = BrandEins::Pages::Article.new(article_html)
+          article.pdf_url or cli.info "No PDF for: \"#{article.title}\""
+        end.compact
+      end
+      def cover_url
+        @cover_url ||= parse_cover_image_url
+      end
+      def title
+        @title ||= document.css('.current-issue h2').children.first.text
+      end
+      def year
+        @year ||= parse_year
+      end
+      def month
+        @month ||= parse_month
+      end
+      def url
+        @url ||= parse_url
+      end
+      def document
+        @document ||= Nokogiri::HTML(html)
+      end
+      def parse_article_urls
+        document.css('.ihv_list > a').each_with_object([]) do |node, links|
+          links << brandeins_url + '/' + node['href']
+        end
+      end
+      def parse_cover_image_url
+        img_tag = primary_cover_image || secondary_cover_image
+        brandeins_url + '/' + img_tag.attributes['src'].text if img_tag
+      end
+      def secondary_cover_image
+        document.css('.preparedTeaserImage img').first
+      end
+      def primary_cover_image
+        document.css('.coverImage img').first
+      end
+      def parse_year
+        if issue_text.match /Ausgabe (?:.+)\/(.+)/
+          $LAST_PAREN_MATCH.to_i
+        end
+      end
+      def parse_month
+        if issue_text.match /Ausgabe (.+)\/(?:.+)/
+          $LAST_PAREN_MATCH.to_i
+        end
+      end
+      def issue_text
+        node = document.css('.current-issue h3').last
+        text = node.children.first.text
+      end
+      def parse_url
+        document.css('[property="og:url"]').first.attributes['content'].value
+      end
+      def brandeins_url
+        BrandEins::Config['base_uri']
+      end
+      def fetcher
+        @fetcher ||= BrandEins::Utils::Fetcher.instance
+      end
+      def save_articles_to(path)
+        article_pdf_urls.each_with_object([]) do |pdf_url, pdf_files|
+          pdf = fetcher.fetch(pdf_url)
+          file_path = file_path_for_pdf(path, pdf_url)
+          File.binwrite(file_path, pdf)
+          pdf_files << file_path
+        end
+      end
+      def file_path_for_pdf(path, pdf_url)
+        target_path = Pathname.new(path)
+        target_path.mkpath
+        file_path = target_path + file_name_for_pdf_url(pdf_url)
+      end
+      def file_name_for_pdf_url(pdf_url)
+        uri_path  = URI(pdf_url).path
+        file_name = File.basename(uri_path)
+      end
+      def cli
+        @cli ||= BrandEins::Utils::CliOutput.instance
+      end
+    end
+  end
+end

data/lib/brandeins/utils/cli_option_parser.rb ADDED

@@ -0,0 +1,40 @@
+require 'optparse'
+require 'ostruct'
+module BrandEins
+  class CliOptionParser
+    def self.parse(args = ARGV)
+      options = OpenStruct.new
+      opt_parser = OptionParser.new do |opts|
+        opts.banner = "Usage: brandeins download --month n --year n"
+        opts.separator ""
+        opts.on('-m MONTH', '--month month', Integer, "The publication month of the magazine. E.g. for may: '5'") do |month|
+          options.month = month
+        end
+        opts.on('-y YEAR', '--year YEAR', Integer, "The publication year of the magazine. E.g. the current year '#{Time.now.year}'") do |year|
+          options.year = year
+        end
+        opts.on('--path [PATH]', 'The path where to download the magazine to. Default is the current directory.') do |path|
+          options.path = path
+        end
+        opts.on('-h', '--help', 'Show this message') do |help|
+          options.help = help
+        end
+        opts.on('-v', '--verbose', 'Be verbose') do |verbose|
+          options.verbose = verbose
+        end
+        opts.on('--version', 'Show the version') do |version|
+          options.version = version
+        end
+      end
+      opt_parser.parse!(args)
+      options
+    end
+  end
+end