RubyGems - pdf-struct - Versions diffs - 1.0.0 - Mend

pdf-struct 1.0.0

Files changed (2) hide show

data/lib/pdf/extractor.rb +127 -0
metadata +46 -0

@@ -0,0 +1,127 @@
+require 'rexml/document'
+require 'rexml/streamlistener'
+module PDF
+	module Extractor
+		class ConversionError   < RuntimeError; end
+		class MalformedPDFError < RuntimeError; end
+		def self.open(path)
+			input = `pdftohtml -enc UTF-8 -xml -stdout #{path} 2>&1`
+			case input
+			#when /command not found/
+			#	raise ConversionError, 'pdftohtml command not found'
+			when /PDF file is damaged/
+				raise MalformedPDFError, "the PDF with filename '#{path}' is malformed"
+			when /Couldn't open file/
+				raise RuntimeError, "Couldn't open file: '#{path}'"
+			else
+				PDF::Extractor::Document.new(input)
+			end
+		end
+	end
+end
+class PDF::Extractor::Element
+	attr_reader   :top, :left, :width, :height, :font
+	attr_accessor :content
+	def initialize(params = {})
+		@top     = params[:top]
+		@left    = params[:left]
+		@width   = params[:width]
+		@height  = params[:height]
+		@font    = params[:font]
+		@content = params[:content]
+	end
+end
+class PDF::Extractor::Font
+	attr_reader   :id, :name, :size
+	attr_accessor :style
+	def initialize(params = {})
+		@id    = params[:id]
+		@size  = params[:size].to_f
+		@name  = params[:name]
+		@style = :normal
+	end
+	def normal?; @style == :normal end
+	def bold?;   @style == :bold   end
+	def italic?; @style == :italic end
+end
+class PDF::Extractor::Page
+	attr_reader :elements, :width, :height
+	def initialize(params = {})
+		@width    = params[:width]
+		@height   = params[:height]
+		@elements = []
+	end
+end
+class PDF::Extractor::Reader
+	include REXML::StreamListener
+	attr_reader :pages, :fonts
+	def initialize
+		@pages, @fonts = [], []
+	end
+	def tag_start(name, attributes)
+		@in_text = false
+		case name
+		when 'page'
+			@pages << PDF::Extractor::Page.new(
+				:width => attributes['width'].to_f,
+				:height => attributes['height'].to_f
+			)
+		when 'fontspec'
+			@fonts << PDF::Extractor::Font.new(
+				:id => attributes['id'],
+				:size => attributes['size'].to_f + 2, # is this right?
+				:name => attributes['family']
+			)
+		when 'text'
+			@in_text = true
+			@pages.last.elements << PDF::Extractor::Element.new(
+				:top => attributes['top'].to_f,
+				:left => attributes['left'].to_f,
+				:width => attributes['width'].to_f,
+				:height => attributes['height'].to_f,
+				:font => @fonts.find{|n| n.id == attributes['font']}
+			)
+		when 'b'
+			@in_text = true
+			@pages.last.elements.last.font.style = :bold
+		when 'i'
+			@in_text = true
+			@pages.last.elements.last.font.style = :italic
+		end
+	end
+	def text(str)
+		@pages.last.elements.last.content = str if @in_text and str =~ /\S/
+	end
+end
+class PDF::Extractor::Document
+	attr_reader :pages
+	def initialize(source)
+		populate source
+	end
+	def elements; @pages.map{|n| n.elements}.flatten end
+private
+	def populate(source)
+		listener = PDF::Extractor::Reader.new
+		REXML::Parsers::StreamParser.new(source, listener).parse
+		@pages, @fonts = listener.pages, listener.fonts
+	end
+end

metadata ADDED

@@ -0,0 +1,46 @@
+--- !ruby/object:Gem::Specification
+name: pdf-struct
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+  prerelease:
+platform: ruby
+authors:
+- Erik Terpstra
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-02-10 00:00:00.000000000 Z
+dependencies: []
+description: PDF::Extractor is a library that provides high level access to the text
+  objects of a PDF document.
+email: unknown@email.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/pdf/extractor.rb
+homepage: http://rubygems.org/gems/pdf-struct
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.16
+signing_key:
+specification_version: 3
+summary: PDF::Extractor
+test_files: []