RubyGems - rpdf2txt - Versions diffs - 0.8.2 - Mend

rpdf2txt 0.8.2

Files changed (127) hide show

data/History.txt +5 -0
data/LICENCE +515 -0
data/Manifest.txt +126 -0
data/README.txt +30 -0
data/Rakefile +24 -0
data/bin/rpdf2txt +58 -0
data/config.save +12 -0
data/install.rb +1098 -0
data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
data/lib/rpdf2txt-rockit/grammar.rb +644 -0
data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
data/lib/rpdf2txt-rockit/indexable.rb +53 -0
data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
data/lib/rpdf2txt-rockit/profiler.rb +168 -0
data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
data/lib/rpdf2txt-rockit/rockit.rb +76 -0
data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
data/lib/rpdf2txt-rockit/token.rb +364 -0
data/lib/rpdf2txt-rockit/version.rb +3 -0
data/lib/rpdf2txt/attributesparser.rb +42 -0
data/lib/rpdf2txt/cmapparser.rb +65 -0
data/lib/rpdf2txt/data/_cmap.grammar +11 -0
data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
data/lib/rpdf2txt/data/cmap.grammar +11 -0
data/lib/rpdf2txt/data/cmap.rb +37 -0
data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
data/lib/rpdf2txt/data/cmap_range.rb +43 -0
data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
data/lib/rpdf2txt/data/pdftext.grammar +102 -0
data/lib/rpdf2txt/data/pdftext.rb +146 -0
data/lib/rpdf2txt/default_handler.rb +352 -0
data/lib/rpdf2txt/lzw.rb +69 -0
data/lib/rpdf2txt/object.rb +1114 -0
data/lib/rpdf2txt/parser.rb +169 -0
data/lib/rpdf2txt/symbol.rb +408 -0
data/lib/rpdf2txt/text.rb +182 -0
data/lib/rpdf2txt/text_state.rb +434 -0
data/lib/rpdf2txt/textparser.rb +42 -0
data/test/data/3392_obj +0 -0
data/test/data/397_decrypted +15 -0
data/test/data/450_decrypted +153 -0
data/test/data/450_obj +0 -0
data/test/data/452_decrypted +125 -0
data/test/data/454_decrypted +108 -0
data/test/data/456_decrypted +106 -0
data/test/data/458_decrypted +111 -0
data/test/data/458_obj +0 -0
data/test/data/460_decrypted +118 -0
data/test/data/460_obj +0 -0
data/test/data/463_decrypted +117 -0
data/test/data/465_decrypted +107 -0
data/test/data/465_obj +0 -0
data/test/data/90_obj +0 -0
data/test/data/90_obj_comp +1 -0
data/test/data/decrypted +0 -0
data/test/data/encrypt_obj +0 -0
data/test/data/encrypt_string +0 -0
data/test/data/encrypt_string_128bit +0 -0
data/test/data/encrypted_object_stream.pdf +0 -0
data/test/data/firststream +1 -0
data/test/data/index.pdfobj +0 -0
data/test/data/index_2bit.pdfobj +0 -0
data/test/data/index_masked.pdfobj +0 -0
data/test/data/indexed.pdfobj +0 -0
data/test/data/indexed_2bit.pdfobj +0 -0
data/test/data/indexed_masked.pdfobj +0 -0
data/test/data/inline.png +0 -0
data/test/data/logo.png +0 -0
data/test/data/lzw.pdfobj +0 -0
data/test/data/lzw_index.pdfobj +0 -0
data/test/data/page_tree.pdf +148 -0
data/test/data/pdf_20.png +0 -0
data/test/data/pdf_21.png +0 -0
data/test/data/pdf_22.png +0 -0
data/test/data/pdf_50.png +0 -0
data/test/data/png.pdfobj +0 -0
data/test/data/space_bug_stream.txt +119 -0
data/test/data/stream.txt +292 -0
data/test/data/stream_kerning_bug.txt +13 -0
data/test/data/stream_kerning_bug2.txt +6 -0
data/test/data/test.pdf +0 -0
data/test/data/test.txt +8 -0
data/test/data/test_text.txt +42 -0
data/test/data/working_obj +0 -0
data/test/data/working_obj2 +0 -0
data/test/mock.rb +149 -0
data/test/suite.rb +30 -0
data/test/test_pdf_object.rb +1802 -0
data/test/test_pdf_parser.rb +1340 -0
data/test/test_pdf_text.rb +789 -0
data/test/test_space_bug_05_2004.rb +87 -0
data/test/test_stream.rb +194 -0
data/test/test_text_state.rb +315 -0
data/usage-en.txt +112 -0
data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
metadata +220 -0

@@ -0,0 +1,364 @@
+require 'rpdf2txt-rockit/syntax_tree'
+require 'rpdf2txt-rockit/sourcecode_dumpable'
+require 'rpdf2txt-rockit/bounded_lru_cache'
+class TokenRegexp < Regexp
+  def initialize(aStringOrRegexp)
+    if aStringOrRegexp.class == String
+      @string = aStringOrRegexp
+      @string = "^(" + @string + ")" unless @string[0,1] == "^"
+      super @string
+    elsif aStringOrRegexp.kind_of?(Regexp)
+      @string = aStringOrRegexp.source
+      @string = "^(" + @string + ")" unless @string[0,1] == "^"
+      super(@string, aStringOrRegexp.options)
+    else
+      raise ArgumentError
+    end
+  end
+  def string
+    if @string =~ /\^\(.*\)/n
+      @string[2...-1]
+    else
+      @string
+    end
+  end
+end
+# Short hands for composing token regexp's
+def tr(aStringOrRegexp)
+  aStringOrRegexp = aStringOrRegexp.source if aStringOrRegexp.class == Regexp
+  TokenRegexp.new(aStringOrRegexp)
+end
+def tre_compose(tokens, map, separator)
+  str = (map % tokens[0].string)
+  tokens[1..-1].each {|token| str += separator + (map % token.string)}
+  tr(str)
+end
+# Alternation: composes the given token regexps as "(a)|(b)|...".
+def ror(*tokens)
+  tre_compose(tokens, "(%s)", "|")
+end
+# Sequence: composes the given token regexps as "(a)(b)...".
+def rseq(*tokens)
+  tre_compose(tokens, "(%s)", "")
+end
+def r?(tokenregexp)
+  tr("(%s)?" % tokenregexp.string)
+end
+def rm(tokenregexp)
+  tr("(%s)*" % tokenregexp.string)
+end
+def rp(tokenregexp)
+  tr("(%s)+" % tokenregexp.string)
+end
+class Token
+  include SourceCodeDumpable
+  attr_reader :skip, :regexp
+  attr_accessor :name
+  def initialize(aString, aStringOrRegexpOrTokenRegexp = "", *options)
+    @name, @regexp = aString, TokenRegexp.new(aStringOrRegexpOrTokenRegexp)
+    parse_options(options)
+  end
+  def hash
+    @hashvalue || (@hashvalue = [self.class, @name, @regexp, @skip].hash)
+  end
+  def parse_options(options)
+    option_names = options.map do |option|
+      if option.kind_of? Symbol
+	option.id2name.downcase
+      else
+	option.downcase
+      end
+    end
+    @skip = true if option_names.include? "skip"
+  end
+  def ==(other)
+    other.class == self.class and
+      other.name == name and
+      other.regexp.inspect == regexp.inspect and
+      other.skip == skip
+  end
+  def match(aString)
+    @regexp.match aString
+  end
+  def value(lexeme)
+    # TODO: Add blocks that map lexeme's to values.
+    lexeme
+  end
+  def create_tree(lexeme, position)
+    t = SyntaxTree.new(@name, ["lexeme", "value"], [value(lexeme), lexeme])
+    t.attributes[:position] = position
+    t
+  end
+  def to_src(assignToName = nil, nameHash = {})
+    if skip
+      assign_to(assignToName, new_of_my_type(name, as_code(regexp.to_src), :Skip))
+    else
+      assign_to(assignToName, new_of_my_type(name, as_code(regexp.to_src)))
+    end
+  end
+  def inspect
+    name || regexp.inspect
+    #osrc = options_to_src
+    #"#{name} = #{regexp.inspect} #{osrc.length>0 ? '['+osrc+']' : ''}"
+  end
+  protected
+  def options_to_src
+    if skip
+      ":Skip"
+    else
+      ""
+    end
+  end
+end
+class EofToken < Token
+  def initialize(*args)
+    # Shouldn't match anything but since I'm not sure how to do a regexp
+    # with that chareacteristic we use a highly unlikely string in the mean
+    # time.
+    super("EOF", "�~~��~^^~" + rand(1e10).inspect)
+  end
+  def ==(other)
+    other.class == self.class
+  end
+end
+class EpsilonToken < Token
+  def initialize
+    # Shouldn't match anything but since I'm not sure how to do a regexp
+    # with that chareacteristic we use a highly unlikely string in the mean
+    # time.
+    super("epsilon", "�~~��~^^~" + rand(1e10).inspect)
+  end
+  def ==(other)
+    other.class == self.class
+  end
+end
+class StringToken < Token
+  def initialize(name, string = name)
+    @string = string
+    super(name, Regexp.escape(string))
+  end
+  def to_src(assignToName = nil, nameHash = {})
+    assign_to(assignToName, new_of_my_type(name, @string))
+  end
+  def hash
+    @hashvalue || (@hashvalue = [self.class, @name, @string].hash)
+  end
+  def to_s
+    "#{id} #{@string} #{name.inspect} #{hash}"
+  end
+  def inspect
+    @string.inspect
+  end
+end
+def string_token(string)
+  StringToken.new("StrToken" + string.hash.inspect, string)
+end
+class RegexpToken < Token
+  def initialize(aString, regexp, *options)
+    @name, @regexp = aString, regexp
+    parse_options(options)
+  end
+end
+def regexp_token(regexp, *options)
+  RegexpToken.new("RegexpToken" + regexp.hash.inspect, regexp, *options)
+end
+def t(name, re, *options)
+  if re.class == String
+    StringToken.new("StrToken" + re.hash.inspect, re)
+  else
+    Token.new(name, re, *options)
+  end
+end
+require 'rpdf2txt-rockit/stringscanner' # DO *NOT* alter since install.rb exploits formatting
+# Forking lexers return LexerToken's with the info about a matching token
+# and the lexer to access for next tokens.
+class LexerToken
+  attr_reader :lexeme, :token_type, :lexer, :position
+  def initialize(lexeme, tokenType, lexer, position = nil)
+    @lexeme, @token_type, @lexer = lexeme, tokenType, lexer
+    @position = position
+  end
+  def create_tree
+    @token_type.create_tree(@lexeme, @position)
+  end
+  def inspect
+    "LT(#{lexeme.inspect}, #{token_type.name})"
+  end
+end
+class LexerPosition
+  attr_reader :row, :column, :char_position
+  def initialize(row = 0, column = 0, char_position = 0)
+    @row, @column, @char_position = row, column, char_position
+  end
+  def +(aString)
+    char_position = @char_position + aString.length
+    num_newlines = aString.count "\r\n"
+    row = @row
+    if num_newlines == 0
+      column = @column + aString.length
+    else
+      row += num_newlines
+      begin
+	column = aString.split("\n").last.split("\r").last.length
+      rescue NameError
+	column = 0
+      end
+    end
+    LexerPosition.new(row, column, char_position)
+  end
+  def inspect
+    "(row=#{row},column=#{@column})"
+  end
+end
+# Lexer that tries every token type at its position and returns all of the
+# matches (see #peek), each paired with the lexer for the position after the
+# match.
+#
+# NOTE: If more performance is needed it might be good to use one char of
+# lookahead to group tokens and reduce the number of tokens that need to
+# be tested.
+class ForkingRegexpLexer
+  attr_accessor :position
+  attr_reader :scanner, :tokens, :lexer_cache, :eof_token
+  protected :lexer_cache
+  # tokens::   Array of token types. The first EofToken found in it is kept
+  #            as eof_token, and all EofTokens are then deleted from the
+  #            array in place (the caller's array is modified).
+  # eofToken:: not used by this method.
+  def initialize(tokens, eofToken = nil)
+    @tokens = tokens
+    @eof_token = tokens.detect {|t| t.kind_of?(EofToken)}
+    @tokens.delete_if {|t| t.kind_of?(EofToken)}
+  end
+  # Fallback EOF token, used by #peek when eof_token is nil.
+  @@eof_token = EofToken.new
+  # Prepares the lexer for lexing aString: resets the position and the
+  # memoized tokens, and creates a fresh scanner and lexer cache.
+  def init(aString)
+    @position, @current_tokens = LexerPosition.new, nil
+    @scanner = StringScanner.new(aString)
+    # We speed things up by only having one lexer at each position. Since there
+    # are typically only a small number of positions we use a BoundedLruCache
+    # of size 20 to keep them in. The cache throws out oldest (least recently
+    # used, NOTE! accessed in the cache not used in the parser) lexer when
+    # new one inserted. This is to keep the memory consumption down.
+    #
+    @lexer_cache = BoundedLruCache.new(20)
+  end
+  # Refactor! Complex interactions when tokens are skipped since the next_lexer
+  # update "our" scanner. Find cleaner way of expressing this!
+  #
+  # Returns the LexerTokens for every token type that matches at the current
+  # position; the result is memoized in @current_tokens. A match of a skip
+  # token is replaced by the tokens found after the skipped text. When
+  # nothing matches, the result is a single EOF LexerToken if the position is
+  # at or past the end of the string, and an empty Array otherwise.
+  def peek
+    return @current_tokens if @current_tokens
+    scanner.pointer = @position.char_position
+    @current_tokens = Array.new
+    tokens.each do |token|
+      if (match = scanner.check(token.regexp))
+	if token.skip
+	  # Token to be skipped => return tokens matching after the skipped one
+	  @current_tokens.concat next_lexer(match).peek
+	  # Peeking at the next lexer moved the shared scanner; move it back.
+	  scanner.pointer = @position.char_position
+	else
+	  @current_tokens.push LexerToken.new(match, token,
+					      next_lexer(match), @position)
+	end
+      end
+    end
+    if @current_tokens.length == 0
+      # Remember the input length the first time it is needed.
+      @string_length = scanner.string.length unless @string_length
+      if @position.char_position >= @string_length
+	@current_tokens.push LexerToken.new(nil, eof_token || @@eof_token,
+					    nil, @position)
+      end
+    end
+    return @current_tokens
+  end
+  # Debug form showing the lexer's position.
+  def inspect
+    "Lexer(#{@position.inspect})"
+  end
+  protected
+  # Returns the lexer for the position after matchingString: the one cached
+  # under that character position, or a newly created one that is then cached.
+  def next_lexer(matchingString)
+    pos = @position + matchingString
+    #create_next_lexer(pos)
+    char_pos = pos.char_position
+    lexer = self.lexer_cache[char_pos]
+    self.lexer_cache[char_pos] = lexer = create_next_lexer(pos) unless lexer
+    lexer
+  end
+  # Creates the lexer for position +pos+, with this lexer as its parent.
+  def create_next_lexer(pos)
+    ReferencingRegexpLexer.new(self, pos)
+  end
+end
+class ReferencingRegexpLexer < ForkingRegexpLexer
+  def initialize(aForkingRegexpLexer, position)
+    @parent_lexer, @position = aForkingRegexpLexer, position
+  end
+  def inspect
+    "RefLexer(#{@position.inspect})"
+  end
+  def scanner
+    @parent_lexer.scanner
+  end
+  protected
+  def create_next_lexer(pos)
+    ReferencingRegexpLexer.new(@parent_lexer, pos)
+  end
+  def lexer_cache
+    @parent_lexer.lexer_cache
+  end
+  def eof_token
+    @parent_lexer.eof_token
+  end
+  def tokens
+    @parent_lexer.tokens
+  end
+end

data/lib/rpdf2txt-rockit/version.rb ADDED

@@ -0,0 +1,3 @@
+# Returns the Rockit version string.
+def rockit_version
+  "0.3.8"
+end

data/lib/rpdf2txt/attributesparser.rb ADDED

@@ -0,0 +1,42 @@
+#!/usr/bin/env ruby
+#
+#	Rpdf2txt -- PDF to Text Parser
+#	Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+#	ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
+#	hwyss@ywesee.com,	aschrafl@ywesee.com
+#
+# AttributesParser -- Rpdf2txt -- 19.12.2002 -- hwyss@ywesee.com
+require 'rpdf2txt-rockit/rockit'
+module Rpdf2txt
+		GRAMMAR_PATH = File.expand_path('data/pdfattributes.grammar', File.dirname(__FILE__))
+		PARSER_PATH = File.expand_path('data/pdfattributes.rb', File.dirname(__FILE__))
+		def attributes_parser(grammar_path=GRAMMAR_PATH, parser_path=PARSER_PATH)
+			oldpath = File.dirname(grammar_path) << "/_" << File.basename(grammar_path)
+			src = File.read(grammar_path)
+			unless(File.exists?(oldpath) && File.read(oldpath)==src)
+				File.delete(oldpath) if File.exists?(oldpath)
+				Parse.generate_parser_from_file_to_file(grammar_path, parser_path, '_attr_parser', 'Rpdf2txt')
+				File.open(oldpath, 'w') { |f| f << src }
+			end
+			require parser_path
+			Rpdf2txt._attr_parser
+		end
+		module_function :attributes_parser
+end

data/lib/rpdf2txt/cmapparser.rb ADDED

@@ -0,0 +1,65 @@
+#!/usr/bin/env ruby
+#
+#	Rpdf2txt -- PDF to Text Parser
+#	Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+#	ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
+#	hwyss@ywesee.com,	aschrafl@ywesee.com
+#
+# TextParser -- Rpdf2txt -- 04.11.2004 -- mwalder@ywesee.com
+# rwaltert@ywesee.com
+require 'rpdf2txt-rockit/rockit'
+module Rpdf2txt
+		CMAP_GRAMMAR = File.expand_path('data/cmap.grammar',
+			File.dirname(__FILE__))
+		CMAP_PARSER = File.expand_path('data/cmap.rb',
+			File.dirname(__FILE__))
+		CMAP_RANGE_GRAMMAR = File.expand_path('data/cmap_range.grammar',
+			File.dirname(__FILE__))
+		CMAP_RANGE_PARSER = File.expand_path('data/cmap_range.rb',
+			File.dirname(__FILE__))
+		def Rpdf2txt.cmap_parser(grammar_path=CMAP_GRAMMAR,
+			parser_path=CMAP_PARSER)
+			oldpath = File.dirname(grammar_path) \
+				<< "/_" << File.basename(grammar_path)
+			src = File.read(grammar_path)
+			unless(File.exists?(oldpath) && File.read(oldpath)==src)
+				File.delete(oldpath) if File.exists?(oldpath)
+				Parse.generate_parser_from_file_to_file(grammar_path,
+					parser_path, '_cmap_parser', 'Rpdf2txt')
+				File.open(oldpath, 'w') { |f| f << src }
+			end
+			require parser_path
+			Rpdf2txt._cmap_parser
+		end
+		def Rpdf2txt.cmap_range_parser(grammar_path=CMAP_RANGE_GRAMMAR,
+			parser_path=CMAP_RANGE_PARSER)
+			oldpath = File.dirname(grammar_path) \
+				<< "/_" << File.basename(grammar_path)
+			src = File.read(grammar_path)
+			unless(File.exists?(oldpath) && File.read(oldpath)==src)
+				File.delete(oldpath) if File.exists?(oldpath)
+				Parse.generate_parser_from_file_to_file(grammar_path,
+					parser_path, '_cmap_range_parser', 'Rpdf2txt')
+				File.open(oldpath, 'w') { |f| f << src }
+			end
+			require parser_path
+			Rpdf2txt._cmap_range_parser
+		end
+end