RubyGems - pdf-reader-markup - Versions diffs - 0.0.1 - Mend

pdf-reader-markup 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

data/.gitignore +17 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +54 -0
data/Rakefile +30 -0
data/lib/pdf/reader/markup.rb +39 -0
data/lib/pdf/reader/markup/page_bold_italic_receiver.rb +167 -0
data/lib/pdf/reader/markup/version.rb +8 -0
data/pdf-reader-markup.gemspec +28 -0
data/spec/lib/markup_spec.rb +101 -0
data/spec/rspec_helper.rb +8 -0
data/spec/sample docs/Dorian_Gray_excerpt.pdf +0 -0
data/spec/sample docs/Lords-Forthcoming-Business.pdf +0 -0
data/spec/sample docs/canterville-ghost-excerpt.pdf +0 -0
metadata +162 -0

data/.gitignore ADDED

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in pdf-reader-markup.gemspec
+gemspec

data/LICENSE.txt ADDED

@@ -0,0 +1,22 @@
+Copyright (c) 2013 Liz Conlan
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,54 @@
+# Pdf::Reader::Markup
+A markup extension for the PDF::Reader library.
+As well as continuing to support fetching a collection of lines for an
+individual page in a PDF file, this adds the method formatted_lines
+which uses HTML-style tags to mark up bold and italic text.
+## Installation
+Add this line to your application's Gemfile:
+    gem 'pdf-reader-markup'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install pdf-reader-markup
+## Usage
+Require the gem in the source file that contains the PDF-handling code:
+    require 'pdf/reader/markup'
+You should now be able to use the custom MarkupPage handler to get back
+matching plaintext and formatted lines for each page:
+    pdf = PDF::Reader.new("./spec/sample docs/Dorian_Gray_excerpt.pdf")
+    page = PDF::Reader::MarkupPage.new(pdf.pages[1])
+    # slightly modified version of the lines() method
+    lines_of_plaintext = page.lines()
+    # the new formatted_lines() method
+    lines_with_markup = page.formatted_lines()
+    # and not forgetting content() which will return all the lines as
+    # a solid block of text
+    entire_page_text = page.content()
+    # and its formatted equivalent markup
+    entire_page_markup = page.markup()
+Note that you can still access the original PDF::Reader methods within the
+same project by using `PDF::Reader::PageTextReceiver` and walking the page,
+which gives access to the standard content and lines functionality.
+You can also, if you prefer, use the
+`PDF::Reader::MarkupPage::PageBoldItalicReceiver` receiver directly rather than
+using the PDF::Reader::MarkupPage wrapper.

data/Rakefile ADDED

@@ -0,0 +1,30 @@
+require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
+require 'rdoc/task'
+desc "Run tests with SimpleCov"
+task :spec do |t|
+  RSpec::Core::RakeTask.new(:cov) do |t|
+    ENV["COVERAGE"] = "1"
+  end
+end
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec
+desc "Alias for 'rake spec'"
+task :test => [:spec]
+# Generate the RDoc documentation
+desc "Create documentation"
+Rake::RDocTask.new("doc") do |rdoc|
+  rdoc.title = "pdf-reader-markup"
+  rdoc.rdoc_dir = "doc"
+  rdoc.rdoc_files.include('README.md')
+  rdoc.main = 'README.md'
+  rdoc.rdoc_files.include('LICENSE.txt')
+  rdoc.rdoc_files.include('lib/pdf/reader/*.rb')
+  rdoc.rdoc_files.include('lib/pdf/reader/markup/*.rb')
+  rdoc.options << "--main"
+end

data/lib/pdf/reader/markup.rb ADDED

@@ -0,0 +1,39 @@
+#encoding: utf-8
+require "pdf/reader/markup/version"
+require "pdf/reader/markup/page_bold_italic_receiver.rb"
+module PDF #:nodoc:all:
+  class Reader #:nodoc:
+    class MarkupPage #:doc:
+      ##
+      # Returns the plaintext content of the page
+      attr_reader :content
+      ##
+      # Returns the formatted lines for the page
+      # as an array
+      attr_reader :formatted_lines
+      ##
+      # Returns the plaintext lines for the page
+      # as an array
+      attr_reader :lines
+      ##
+      # Returns the formatted content of the page
+      attr_reader :markup
+      ##
+      # Wrapper function for walking the page with the
+      # Reader::MarkupPage::PageBoldItalicReceiver receiver
+      def initialize(page)
+        receiver = PageBoldItalicReceiver.new()
+        page.walk(receiver)
+        @content = receiver.content
+        @markup = receiver.markup
+        @lines = @content.lines.to_a
+        @formatted_lines = @markup.lines.to_a
+      end
+    end
+  end
+end

data/lib/pdf/reader/markup/page_bold_italic_receiver.rb ADDED

@@ -0,0 +1,167 @@
+#encoding: utf-8
+require "pdf/reader"
+require "nokogiri"
+module PDF #:nodoc:all:
+  class Reader #:nodoc:
+    class MarkupPage #:doc:
+      ##
+      # Builds a UTF-8 plaintext string and a UTF-8 string that includes
+      # simple Bold and Italic markup of all the text on a single page by
+      # processing all the operators in a content stream.
+      class PageBoldItalicReceiver < PDF::Reader::PageTextReceiver
+        ##
+        # starting a new page: hands the page to the parent receiver, then
+        # resets the per-page state used to build the markup
+        def page=(page)
+          super(page)
+          @last_tag_end = "" # closing tag(s) still owed for the current run
+          @open_tag = ""     # opening tag(s) seen for the previous glyph
+          @lasty = 0.0       # y position recorded for the previous glyph
+          @footer = []       # fragments build_markup collects when y < 50
+          @text = []         # fragments of the line currently being built
+          @lines = []        # completed, marked-up lines
+        end
+        ##
+        # Returns the value of the markup attribute - equivalent to the
+        # content attribute but with bold and italic markup
+        def markup
+          unless @text.empty?
+            line = fix_markup("#{@text.join("").strip}#{@last_tag_end}")
+            @lines << line
+            @text = []
+          end
+          if @footer.join("").strip.empty?
+            if @lines.last.empty?
+              output = @lines[0..-2].join("\n")
+            else
+              output = @lines.join("\n")
+            end
+          else
+            output = %Q|#{@lines.join("\n")}\n#{@footer.join("")}|
+          end
+          output
+        end
+        ##
+        # Returns the value of the content attribute
+        def content
+          lines = super.lines.to_a
+          fixed = []
+          current_line = 0
+          offset = 0
+          formatted_lines = markup.lines.to_a
+          lines.each_with_index do |line, index|
+            formatted_line = formatted_lines[index + offset]
+            if line.strip == "" and (formatted_line and formatted_lines[index + offset].strip != "")
+              offset -= 1
+            else
+              fixed << line
+            end
+          end
+          lines = fixed.join("")
+          lines
+        end
+        private
+        def fix_markup(string)
+          #get Nokogiri to close any open tags
+          string = Nokogiri::HTML::fragment(string).to_html
+          #strip empty markup tags
+          while string =~ /<(?:b|i)>\s*<\/(?:b|i)>/
+            string = string.gsub(/<(?:b|i)>\s*<\/(?:b|i)>/, "").strip
+          end
+          string
+        end
+        def font_type(font, type)
+          if font.basefont.to_s.include?(type)
+            return true
+          end
+          false
+        end
+        def markup_tags(font)
+          open = ""
+          close = ""
+          if font_type(@state.current_font, "Bold")
+            open = "<b>"
+            close = "</b>"
+          end
+          if font_type(@state.current_font, "Italic")
+            open = "#{open}<i>"
+            close = "</i>#{close}"
+          end
+          {:open => open, :close => close}
+        end
+        def append_line(tags, run)
+          line = fix_markup("#{@text.join("").strip}#{@last_tag_end}")
+          unless @lines.empty? and line.strip.empty?
+            @lines << line
+          end
+          @last_tag_end = ""
+          @text = ["#{tags[:open]}#{run.to_s}"]
+        end
+        def internal_show_text(string)
+          if @state.current_font.nil?
+            raise PDF::Reader::MalformedPDFError, "current font is invalid"
+          end
+          glyphs = @state.current_font.unpack(string)
+          text = ""
+          glyphs.each_with_index do |glyph_code, index|
+            # paint the current glyph
+            newx, newy = @state.trm_transform(0,0)
+            utf8_chars = @state.current_font.to_utf8(glyph_code)
+            # apply to glyph displacment for the current glyph so the next
+            # glyph will appear in the correct position
+            glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
+            th = 1
+            scaled_glyph_width = glyph_width * @state.font_size * th
+            run = TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars)
+            @characters << run
+            @state.process_glyph_displacement(glyph_width, 0, utf8_chars == SPACE)
+            build_markup(newy, run)
+          end
+        end
+        def build_markup(newy, run)
+          tags = markup_tags(@state.current_font)
+          if tags[:open] == @open_tag
+            if newy < 50
+              @footer << run.to_s
+              newy = @lasty
+            else
+              if newy < @lasty
+                append_line(tags, run)
+              else
+                @text << "#{run.to_s}"
+              end
+            end
+          else
+            if newy < 50
+              @footer << "#{@last_tag_end}#{run.to_s}"
+              newy = @lasty
+            else
+              if newy < @lasty
+                append_line(tags, run)
+              else
+                @text << "#{@last_tag_end}#{tags[:open]}#{run.to_s}"
+              end
+            end
+            @last_tag_end = tags[:close]
+          end
+          @open_tag = tags[:open]
+          @lasty = newy
+        end
+      end
+    end
+  end
+end

data/lib/pdf/reader/markup/version.rb ADDED

@@ -0,0 +1,8 @@
+#:stopdoc:
+module PDF
+  class Reader
+    class MarkupPage
+      VERSION = "0.0.1"
+    end
+  end
+end

data/pdf-reader-markup.gemspec ADDED

@@ -0,0 +1,28 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'pdf/reader/markup/version'
+Gem::Specification.new do |spec|
+  spec.name          = "pdf-reader-markup"
+  spec.version       = PDF::Reader::MarkupPage::VERSION
+  spec.authors       = ["Liz Conlan"]
+  spec.email         = ["lizconlan@gmail.com"]
+  spec.description   = %q{A markup extension for the PDF::Reader library}
+  spec.summary       = %q{Adds the option to retrieve text lines marked up with bold and italic tags when parsing PDF pages with PDF::Reader}
+  spec.homepage      = "https://github.com/lizconlan/pdf-reader-markup"
+  spec.license       = "MIT"
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  spec.add_dependency "pdf-reader", "~> 1.3"
+  spec.add_dependency "nokogiri"
+  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "rake"
+  spec.add_development_dependency "rspec"
+  spec.add_development_dependency "simplecov"
+end

data/spec/lib/markup_spec.rb ADDED

@@ -0,0 +1,101 @@
+#encoding: utf-8
+require_relative "../rspec_helper.rb"
+require './lib/pdf/reader/markup.rb'
+describe "MarkupPage" do
+  context "when given an excerpt from The Canterville Ghost" do
+    before(:all) do
+      @pdf = PDF::Reader.new("./spec/sample docs/canterville-ghost-excerpt.pdf")
+    end
+    context "reading the first page" do
+      before(:all) do
+        @pdf_page = PDF::Reader::MarkupPage.new(@pdf.pages[0])
+      end
+      it "should find 8 lines of text" do
+        @pdf_page.lines.count.should eq 8
+        @pdf_page.formatted_lines.count.should eq 8
+      end
+      it "should find a mixture of plain and italic text" do
+        @pdf_page.formatted_lines[0].should eq "The Canterville Ghost\n"
+        @pdf_page.formatted_lines[1].should eq "<i>An amusing chronicle of the tribulations of the Ghost of Canterville Chase when his ancestral halls became the</i>\n"
+        @pdf_page.formatted_lines[2].should eq "<i>home of the American Minister to the Court of St. James</i>\n"
+      end
+    end
+    context "reading the second page" do
+      before(:all) do
+        @pdf_page = PDF::Reader::MarkupPage.new(@pdf.pages[1])
+      end
+      it "should find 16 lines of text" do
+        @pdf_page.lines.count.should eq 16
+        @pdf_page.formatted_lines.count.should eq 16
+      end
+      it "should correctly relocate the footer to the end of the text block" do
+        @pdf_page.lines.last.should eq "The Pennsylvania State University is an equal opportunity university."
+      end
+    end
+  end
+  context "when given an excerpt from The Picture of Dorian Gray" do
+    before(:all) do
+      @pdf = PDF::Reader.new("./spec/sample docs/Dorian_Gray_excerpt.pdf")
+    end
+    context "reading the second page" do
+      before(:all) do
+        @pdf_page = PDF::Reader::MarkupPage.new(@pdf.pages[1])
+      end
+      it "should find 24 lines of text" do
+        @pdf_page.lines.count.should eq 24
+        @pdf_page.formatted_lines.count.should eq 24
+      end
+      it "should find a mixture of plain, bold and italic text" do
+        @pdf_page.formatted_lines[0].should eq "<i>The Picture of Dorian Gray</i>\n"
+        @pdf_page.formatted_lines[1].should eq "<b>Chapter I</b>\n"
+        @pdf_page.formatted_lines[2].should eq "The studio was filled with the rich odor of roses, and\n"
+        @pdf_page.formatted_lines[3].should eq "when the light summer wind stirred amidst the trees of the\n"
+        @pdf_page.formatted_lines[4].should eq "garden there came through the open door the heavy scent\n"
+        @pdf_page.formatted_lines[5].should eq "of the lilac, or the more delicate perfume of the pink-\n"
+        @pdf_page.formatted_lines[6].should eq "flowering thorn.\n"
+      end
+    end
+  end
+  context "when given a House of Lords Forthcoming Business document" do
+    before(:all) do
+      @pdf = PDF::Reader.new("./spec/sample docs/Lords-Forthcoming-Business.pdf")
+    end
+    context "reading the first page" do
+      before(:all) do
+        @pdf_page = PDF::Reader::MarkupPage.new(@pdf.pages[0])
+      end
+      it "should find 32 lines of text" do
+        @pdf_page.lines.count.should eq 32
+        @pdf_page.formatted_lines.count.should eq 32
+      end
+      it "should find a mixture of plain, bold and italic text" do
+        @pdf_page.lines[0].should eq "                   GOVERNMENT WHIPS’ OFFICE\n"
+        @pdf_page.formatted_lines[0].should eq "GOVERNMENT WHIPS’ OFFICE\n"
+        @pdf_page.lines[3].should eq "                     FORTHCOMING BUSINESS\n"
+        @pdf_page.formatted_lines[3].should eq "<b>FORTHCOMING BUSINESS</b>\n"
+        @pdf_page.lines[6].should eq "                        [Notes about this document are set out at the end]\n"
+        @pdf_page.formatted_lines[6].should eq "[<i>Notes about this document are set out at the end</i>]\n"
+        @pdf_page.formatted_lines[29].should eq "<b><i>Easter adjournment:</i></b>\n"
+      end
+    end
+  end
+end

data/spec/rspec_helper.rb ADDED

@@ -0,0 +1,8 @@
+if ENV['COVERAGE']
+  require 'simplecov'
+  SimpleCov.start do
+    add_filter 'spec'
+  end
+end
+require 'rspec/autorun'

data/spec/sample docs/Dorian_Gray_excerpt.pdf ADDED

Binary file

data/spec/sample docs/Lords-Forthcoming-Business.pdf ADDED

Binary file

data/spec/sample docs/canterville-ghost-excerpt.pdf ADDED

Binary file

metadata ADDED

@@ -0,0 +1,162 @@
+--- !ruby/object:Gem::Specification
+name: pdf-reader-markup
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+  prerelease:
+platform: ruby
+authors:
+- Liz Conlan
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-11-17 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: pdf-reader
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A markup extension for the PDF::Reader library
+email:
+- lizconlan@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- lib/pdf/reader/markup.rb
+- lib/pdf/reader/markup/page_bold_italic_receiver.rb
+- lib/pdf/reader/markup/version.rb
+- pdf-reader-markup.gemspec
+- spec/lib/markup_spec.rb
+- spec/rspec_helper.rb
+- spec/sample docs/Dorian_Gray_excerpt.pdf
+- spec/sample docs/Lords-Forthcoming-Business.pdf
+- spec/sample docs/canterville-ghost-excerpt.pdf
+homepage: https://github.com/lizconlan/pdf-reader-markup
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: Adds the option to retrieve text lines marked up with bold and italic tags
+  when parsing PDF pages with PDF::Reader
+test_files:
+- spec/lib/markup_spec.rb
+- spec/rspec_helper.rb
+- spec/sample docs/Dorian_Gray_excerpt.pdf
+- spec/sample docs/Lords-Forthcoming-Business.pdf
+- spec/sample docs/canterville-ghost-excerpt.pdf