RubyGems - rpdf2txt - Versions diffs - 0.8.2 → 0.8.3 - Mend

rpdf2txt 0.8.2 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

data/.gemtest +0 -0
data/History.txt +4 -0
data/Manifest.txt +0 -4
data/README.txt +16 -3
data/bin/rpdf2txt +4 -1
data/lib/rpdf2txt/data/cmap.rb +10 -9
data/lib/rpdf2txt/data/cmap_range.rb +13 -12
data/lib/rpdf2txt/data/pdfattributes.rb +14 -13
data/lib/rpdf2txt/data/pdftext.rb +19 -18
data/lib/rpdf2txt/object.rb +68 -13
data/lib/rpdf2txt/parser.rb +6 -2
data/lib/rpdf2txt/text.rb +1 -2
data/lib/rpdf2txt/text_state.rb +10 -2
data/lib/rpdf2txt-rockit/rockit.rb +1 -1
data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +1 -0
data/lib/rpdf2txt-rockit/token.rb +1 -0
data/test/mock.rb +19 -11
data/test/test_object.rb +33 -0
data/test/test_pdf_object.rb +25 -24
data/test/test_pdf_parser.rb +8 -5
data/test/test_pdf_text.rb +11 -10
data/test/test_space_bug_05_2004.rb +2 -1
data/test/test_stream.rb +6 -5
data/test/test_text_state.rb +220 -219
metadata +13 -14
data/config.save +0 -12
data/lib/rpdf2txt/data/_cmap.grammar +0 -11
data/lib/rpdf2txt/data/_cmap_range.grammar +0 -15
data/lib/rpdf2txt/data/_pdfattributes.grammar +0 -32

data/test/mock.rb CHANGED Viewed

@@ -16,7 +16,8 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-require 'runit/error'
+#require 'runit/error'
+require 'test/unit'
 class Mock
@@ -58,7 +59,8 @@ class Mock
     #
     def __verify
         if @next_call != @mock_calls.length
-            raise RUNIT::AssertionFailedError,
+            #raise RUNIT::AssertionFailedError,
+            raise Test::Unit::AssertionFailedError,
                   "not all expected method calls were made to #{@name}",
                   caller
         end
@@ -78,33 +80,37 @@ private
     #
     def __mock_call( name, args, block )
         if @next_call >= @mock_calls.length
-            raise RUNIT::AssertionFailedError,
+            #raise RUNIT::AssertionFailedError,
+            raise Test::Unit::AssertionFailedError,
                   "unexpected call to #{name} method of #{@name}",
                   caller(2)
         end
         expected_name,body = @mock_calls[@next_call]
         @next_call += 1
         if name != expected_name
-            raise RUNIT::AssertionFailedError,
+            #raise RUNIT::AssertionFailedError,
+            raise Test::Unit::AssertionFailedError,
                   "wrong method called on #{@name}; " +
                       "expected #{expected_name}, was #{name}",
                   caller(2)
         end
         args_length = args.length + (block ? 1 : 0)
         if body.arity < 0
             if (body.arity+1).abs > args_length
-                raise RUNIT::AssertionFailedError,
+                #raise RUNIT::AssertionFailedError,
+                raise Test::Unit::AssertionFailedError,
                       "too few arguments to #{name} method of #{@name}; " +
                           "require #{(body.arity+1).abs}, got #{args.length}",
                       caller(2)
             end
         else
             if body.arity != args_length
-                raise RUNIT::AssertionFailedError,
+                #raise RUNIT::AssertionFailedError,
+                raise Test::Unit::AssertionFailedError,
                       "wrong number of arguments to " +
                           "#{name} method of #{@name}; " +
                           "require #{body.arity}, got #{args.length}",
@@ -120,7 +126,8 @@ private
             end
             if not precondition_ok
-                raise RUNIT::AssertionFailedError,
+                #raise RUNIT::AssertionFailedError,
+                raise Test::Unit::AssertionFailedError,
                     "precondition of #{name} method violated",
                     caller(2)
             end
@@ -135,7 +142,8 @@ private
     #  The name of a precondition for a method
     def Mock.__pre( method )
-        "__pre_#{method.to_i}".intern
+        #"__pre_#{method.to_i}".intern
+        "__pre_#{method}".intern
     end

data/test/test_object.rb ADDED Viewed

@@ -0,0 +1,33 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+# TestObject -- rpdf2txt -- 26.05.2011 -- mhatakeyama@ywesee.com
+$: << File.expand_path('../lib', File.dirname(__FILE__))
+require 'test/unit'
+require 'flexmock'
+require 'rpdf2txt/object'
+module Rpdf2txt
+  class TestPageLeaf < Test::Unit::TestCase
+    include FlexMock::TestCase
+    def test_merge_snippets
+      pageleaf = Rpdf2txt::PageLeaf.new
+      snippet1 = flexmock('snippet1',
+                          :txt => 'txt1',
+                          :txt= => nil
+                         )
+      snippet2 = flexmock('snippet2',
+                          :txt => 'txt2',
+                          :txt= => nil
+                         )
+      text_snippets = [snippet1, snippet2, snippet2]
+      result = pageleaf.merge_snippets(text_snippets)
+      assert_equal(2, result.length)
+      assert_kind_of(snippet1.class, result[0])
+      assert_kind_of(snippet2.class, result[1])
+    end
+  end
+end

data/test/test_pdf_object.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+# encoding: ascii-8bit
 #
 #	Rpdf2txt -- PDF to Text Parser
 #	Copyright (C) 2003 Andreas Schrafl, Hannes Wyss, Masaomi Hatakeyama
@@ -531,13 +532,13 @@ ET
     class TestEncrypt < Test::Unit::TestCase
         def setup
             file = File.expand_path('./data/encrypt_string', File.dirname(__FILE__))
-            src_encrypt_obj  = File.read(file)
+            src_encrypt_obj  = open(file, 'rb'){|file| file.read}
             @encrypt = Rpdf2txt::PdfEncrypt.new(src_encrypt_obj)
             @encrypt.file_id = '8664e6986751f2a49dccc9a4b40a4f18'
         end
         def test_decrypt
             file = File.expand_path('./data/working_obj', File.dirname(__FILE__))
-            input = File.read(file)
+            input = open(file, 'rb'){|file| file.read}
             pdf_obj = Rpdf2txt::Stream.new(input)
             assert_equal("dc08b36009e48618f99c", @encrypt.decrypt_key(pdf_obj).unpack('h*').first)
             #if the stream could be inflated, the decryption is ok!
@@ -548,7 +549,7 @@ ET
         end
         def test_decrypt2
             file = File.expand_path('./data/90_obj', File.dirname(__FILE__))
-            input = File.read(file)
+            input = open(file, 'rb'){|file| file.read}
             pdf_obj = Rpdf2txt::Stream.new(input)
             assert_equal("7617ca1ac5babcf09cdf", @encrypt.decrypt_key(pdf_obj).unpack('h*').first)
             #if the stream could be inflated, the decryption is ok!
@@ -559,7 +560,7 @@ ET
         end
         def test_decrypt3
             file = File.expand_path('./data/working_obj2', File.dirname(__FILE__))
-            input = File.read(file)
+            input = open(file, 'rb'){|file| file.read}
             pdf_obj = Rpdf2txt::Stream.new(input)
             assert_equal("a9a666959bd64a96551b", @encrypt.decrypt_key(pdf_obj).unpack('h*').first)
             #if the stream could be inflated, the decryption is ok!
@@ -570,7 +571,7 @@ ET
         end
         def test_decrypt5
             file = File.expand_path('./data/458_obj', File.dirname(__FILE__))
-            input = File.read(file)
+            input = open(file, 'rb'){|file| file.read}
             pdf_obj = Rpdf2txt::Stream.new(input)
             #assert_equal("1aaeedd5d5304b79709b", @encrypt.decrypt_key(pdf_obj).unpack('h*').first)
             #if the stream could be inflated, the decryption is ok!
@@ -581,7 +582,7 @@ ET
         end
         def test_decrypt6
             file = File.expand_path('./data/450_obj', File.dirname(__FILE__))
-            input = File.read(file)
+            input = open(file, 'rb'){|file| file.read}
             pdf_obj = Rpdf2txt::Stream.new(input)
             #assert_equal("1aaeedd5d5304b79709b", @encrypt.decrypt_key(pdf_obj).unpack('h*').first)
             #if the stream could be inflated, the decryption is ok!
@@ -592,7 +593,7 @@ ET
         end
         def test_decrypt7
             file = File.expand_path('./data/465_obj', File.dirname(__FILE__))
-            input = File.read(file)
+            input = open(file, 'rb'){|file| file.read}
             pdf_obj = Rpdf2txt::Stream.new(input)
             #assert_equal("1aaeedd5d5304b79709b", @encrypt.decrypt_key(pdf_obj).unpack('h*').first)
             #if the stream could be inflated, the decryption is ok!
@@ -603,7 +604,7 @@ ET
         end
         def test_decrypt_key
             file = File.expand_path('./data/encrypt_obj', File.dirname(__FILE__))
-            src = File.read(file)
+            src = open(file, 'rb'){|file| file.read}
             #byte position important! do not indent these lines!!!
             obj_src =  <<-EOS
 473 0 obj
@@ -618,7 +619,7 @@ endobj
         end
         def test_inflate_obj
             file = File.expand_path('./data/90_obj_comp', File.dirname(__FILE__))
-            input = File.read(file)
+            input = open(file, 'rb'){|file| file.read}
             input = [input].pack('H*')
             #	puts input
             assert_nothing_raised{
@@ -628,7 +629,7 @@ endobj
         end
         def test_parse_encrypt
             file = File.expand_path('./data/encrypt_obj', File.dirname(__FILE__))
-            src = File.read(file)
+            src = open(file, 'rb'){|file| file.read}
             encrypt = Rpdf2txt::PdfEncrypt.new(src)
             encrypt.file_id = '8664e6986751f2a49dccc9a4b40a4f18'
             assert_equal("00ecc7a7bf8d68c564a21b98258b1dbff2aaf8d24bfdbaa74a9a073467d896b6", encrypt.user_key.unpack("H*").first)
@@ -639,7 +640,7 @@ endobj
         end
         def test_endianess
             file = File.expand_path('./data/encrypt_obj', File.dirname(__FILE__))
-            src = File.read(file)
+            src = open(file, 'rb'){|file| file.read}
             encrypt = Rpdf2txt::PdfEncrypt.new(src)
             encrypt.big_endian?
         end
@@ -648,13 +649,13 @@ endobj
     def setup
       file = File.expand_path('./data/encrypt_string_128bit',
                               File.dirname(__FILE__))
-      src_encrypt_obj  = File.read(file)
+      src_encrypt_obj  = open(file, 'rb'){|file| file.read}
       @encrypt = Rpdf2txt::PdfEncrypt.new(src_encrypt_obj)
       @encrypt.file_id = 'D816A5E838D50653C19DB62504229EB6'
     end
     def test_decrypt8
       file = File.expand_path('./data/3392_obj', File.dirname(__FILE__))
-      input = File.read(file)
+      input = open(file, 'rb'){|file| file.read}
       pdf_obj = Rpdf2txt::Stream.new(input)
       #if the stream could be inflated, the decryption is ok!
       assert_nothing_raised{
@@ -1231,7 +1232,7 @@ perm\351abilit\351 vasculaire et une inflammation.
 Swissmedic Journal 03/2006                                                                            226
             EOS
       result = handler.out.strip
-=begin
+=begin keep
       [expected.size, result.size].max.times do |idx|
         unless result[idx] == expected[idx]
           flunk "unexpected result: (#{result[idx]}/#{expected[idx]} at #{idx}) ...#{expected[idx-10,20].inspect}..."
@@ -1653,7 +1654,7 @@ Seite 1 von 1083
       expected = "HEUMANN PH GMBH&CO. KG                                                                                20    St"
       assert_equal(expected.strip, handler.out.strip)
     end
-=begin
+=begin keep
     def test_text_space_bug2
       stream = Stream.new
       path = File.expand_path('data/space_bug_stream2.txt',
@@ -1705,7 +1706,7 @@ endobj
   class TestImage < Test::Unit::TestCase
     def test_png
       path = File.expand_path('data/png.pdfobj', File.dirname(__FILE__))
-      src = File.read(path)
+      src = open(path, 'rb'){|file| file.read}
       obj = Image.new(src)
       assert_nothing_raised { obj.image }
       path = File.expand_path('data/logo.png', File.dirname(__FILE__))
@@ -1714,10 +1715,10 @@ endobj
     end
     def test_indexed
       path = File.expand_path('data/index.pdfobj', File.dirname(__FILE__))
-      src = File.read(path)
+      src = open(path, 'rb'){|file| file.read}
       index = Stream.new(src)
       path = File.expand_path('data/indexed.pdfobj', File.dirname(__FILE__))
-      src = File.read(path)
+      src = open(path, 'rb'){|file| file.read}
       obj = Image.new(src)
       obj.build_tree(51 => index)
       assert_nothing_raised { obj.image }
@@ -1727,10 +1728,10 @@ endobj
     end
     def test_indexed_2bit
       path = File.expand_path('data/index_2bit.pdfobj', File.dirname(__FILE__))
-      src = File.read(path)
+      src = open(path, 'rb'){|file| file.read}
       index = Stream.new(src)
       path = File.expand_path('data/indexed_2bit.pdfobj', File.dirname(__FILE__))
-      src = File.read(path)
+      src = open(path, 'rb'){|file| file.read}
       obj = Image.new(src)
       obj.build_tree(21 => index)
       assert_nothing_raised { obj.image }
@@ -1740,10 +1741,10 @@ endobj
     end
     def test_indexed_masked
       path = File.expand_path('data/index_masked.pdfobj', File.dirname(__FILE__))
-      src = File.read(path)
+      src = open(path, 'rb'){|file| file.read}
       index = Stream.new(src)
       path = File.expand_path('data/indexed_masked.pdfobj', File.dirname(__FILE__))
-      src = File.read(path)
+      src = open(path, 'rb'){|file| file.read}
       obj = Image.new(src)
       obj.build_tree(21 => index)
       assert_nothing_raised { obj.image }
@@ -1759,10 +1760,10 @@ endobj
     end
     def test_lzw_image
       path = File.expand_path('data/lzw_index.pdfobj', File.dirname(__FILE__))
-      src = File.read(path)
+      src = open(path, 'rb'){|file| file.read}
       index = Stream.new(src)
       path = File.expand_path('data/lzw.pdfobj', File.dirname(__FILE__))
-      src = File.read(path)
+      src = open(path, 'rb'){|file| file.read}
       obj = Image.new(src)
       obj.build_tree(21 => index)
       assert_nothing_raised { obj.image }

data/test/test_pdf_parser.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-#
+# encoding: ascii-8bit
 #	Rpdf2txt -- PDF to Text Parser
 #	Copyright (C) 2003 Andreas Schrafl, Hannes Wyss, Masaomi Hatakeyama
 #
@@ -75,7 +75,7 @@ class TestParser < Test::Unit::TestCase
 	end
 	def setup
 		file = File.expand_path('./data/page_tree.pdf', File.dirname(__FILE__))
-		input = File.read(file)
+		input = open(file, 'rb'){|file| file.read}
 		@parser = Rpdf2txt::Parser.new(input)
 	end
 	def test_object_catalogue
@@ -101,7 +101,7 @@ class TestParser < Test::Unit::TestCase
 	end
     def test_rebuild_object_catalogue
       file = File.expand_path('./data/encrypted_object_stream.pdf', File.dirname(__FILE__))
-      input = File.read(file)
+      input = open(file, 'rb'){|file| file.read}
       parser = Rpdf2txt::Parser.new(input)
       cat = parser.object_catalogue
       assert_equal(3, cat.length)
@@ -322,7 +322,7 @@ endobj
 		leaf = Rpdf2txt::PageLeaf.new
 		expected = <<-EOS
 Paroxetin besitzt eine selektive Wirkung; in-vitro Studien haben gezeigt, dass es, im Gegensatz zu
-trizyklischen Antidepressiva, eine geringe Affinit\344t f\374r  a1-, a2- und b-Adrenozeptoren sowie f\374r
+trizyklischen Antidepressiva, eine geringe Affinit\344t f\374r a1-, a2- und b-Adrenozeptoren sowie f\374r
 Dopamin (D2)-, 5-HT1-artige, 5-HT2 und Histamin (H1)-Rezeptoren aufweist. Das Fehlen einer
 		EOS
 		handler = Rpdf2txt::SimpleHandler.new
@@ -464,7 +464,9 @@ endobj
 			36 => 8805,
 		}
 		font2.cmap = cmap
+#require 'pp'
+#print "font2="
+#pp font2
 		fonts = {
 			:tt0	=> font0,
 			:tt2	=> font2,
@@ -472,6 +474,7 @@ endobj
 		txt.current_page = FontDonorStub.new(fonts)
 		leaf = Rpdf2txt::PageLeaf.new
 		expected = "In Studie 1 evaluierte man 271 Patienten mit einer m\344ssigen bis schweren aktiven rheumatoiden \nArthritis, die \26318 Jahre alt waren, bei denen die Therapie mit mindestens einem, aber mit nicht mehr \n"
+		#expected = "In Studie 1 evaluierte man 271 Patienten mit einer m\344ssigen bis schweren aktiven rheumatoiden \nArthritis, die 18 Jahre alt waren, bei denen die Therapie mit mindestens einem, aber mit nicht mehr \n"
 		handler = Rpdf2txt::SimpleHandler.new
 		leaf.join_snippets(txt.scan, handler)
 		result = handler.out

data/test/test_pdf_text.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+# encoding: ascii-8bit
 #
 #	Rpdf2txt -- PDF to Text Parser
 #	Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
@@ -36,7 +37,7 @@ module Rpdf2txt
 	class TestText < Test::Unit::TestCase
 		def setup
 			path = File.expand_path("./data/test_text.txt", File.dirname(__FILE__))
-			src = File.read(path)
+			src = open(path, 'rb'){|file| file.read}
 			#@handler = Rpdf2txt::HTMLHandler.new
 			@text=Rpdf2txt::Text.new(src)
 		end
@@ -78,7 +79,7 @@ ET
 			ast = Rpdf2txt.text_parser.parse(@text.src)
 			assert_equal("-0.0002", ast.values.first.charspace.value)
 			text_state = Mock.new("text_state")
-			text_state.__next(:transformation_matrix=) {}
+			text_state.__next(:transformation_matrix=) {|*x|}
 			@text.text_state = text_state
 			text_state.__next(:set_char_spacing){|value|
 				assert_equal("-0.0002", value)
@@ -94,7 +95,7 @@ BT
 ET
 			EOS
 			text_state = Mock.new("text_state")
-			text_state.__next(:transformation_matrix=) {}
+			text_state.__next(:transformation_matrix=) {|*x|}
 			@text.text_state = text_state
 			text_state.__next(:update_x){ |x|
 				assert_equal(-36.7896, x)
@@ -116,10 +117,10 @@ BT
 ET
 			EOS
 			text_state = Mock.new("text_state")
-			text_state.__next(:transformation_matrix=) {}
+			text_state.__next(:transformation_matrix=) {|*x|}
 			@text.text_state = text_state
-			text_state.__next(:update_x){}
-			text_state.__next(:update_y){}
+			text_state.__next(:update_x){|*x|}
+			text_state.__next(:update_y){|*x|}
 			@text.scan
 			text_state.__verify
 		end
@@ -132,7 +133,7 @@ BT
 ET
 			EOS
 			text_state = Mock.new("text_state")
-			text_state.__next(:transformation_matrix=) {}
+			text_state.__next(:transformation_matrix=) {|*x|}
 			@text.text_state = text_state
 			current_page.__next(:font){ |font|
 				assert_equal(:f16, font)
@@ -151,7 +152,7 @@ BT
 ET
 		EOS
 			text_state = Mock.new("text_state")
-			text_state.__next(:transformation_matrix=) {}
+			text_state.__next(:transformation_matrix=) {|*x|}
 			@text.text_state = text_state
 			current_page.__next(:font){ |font|
 				assert_equal(:c2_0, font)
@@ -169,7 +170,7 @@ BT
 ET
 			EOS
 			text_state = Mock.new("text_state")
-			text_state.__next(:transformation_matrix=) {}
+			text_state.__next(:transformation_matrix=) {|*x|}
 			@text.text_state = text_state
 			text_state.__next(:set_xscale){|x|
 				assert_equal("10", x)
@@ -199,7 +200,7 @@ BT
 ET
 			EOS
 			text_state = Mock.new("text_state")
-			text_state.__next(:transformation_matrix=) {}
+			text_state.__next(:transformation_matrix=) {|*x|}
 			@text.text_state = text_state
 			text_state.__next(:set_word_spacing){ |wordspace|
 				assert_equal('0.0000', wordspace)

data/test/test_space_bug_05_2004.rb CHANGED Viewed

@@ -70,7 +70,8 @@ endobj
 			font30 = Font.new(font30_src) # WinAnsi Encoded
 			path = File.expand_path('data/space_bug_stream.txt',
 				File.dirname(__FILE__))
-			stream = Stream.new(File.read(path))
+            stream = open(path, 'rb'){|file| file.read}
+			stream = Stream.new(stream)
 			page = FontDonor.new
 			page.fonts = {
 				:f3		=>	font3,

data/test/test_stream.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+# encoding: ascii-8bit
 # TestStream -- rpdf2txt -- 01.06.2005 -- hwyss@ywesee.com
 $: << File.expand_path('../lib', File.dirname(__FILE__))
@@ -32,13 +33,13 @@ BT /F1 16 Tf 1 0 0 -1 0 14.347 Tm(Ponstan
     def test_decode_raw_stream
       file = File.expand_path('./data/firststream',
         File.dirname(__FILE__))
-      deflated = File.read(file)
+      deflated = open(file, 'rb'){|file| file.read}
       src = "stream\n#{deflated}endstream"
       stream = Rpdf2txt::Stream.new(src)
       stream.attributes.store(:filter, '/FlateDecode')
       file = File.expand_path('./data/test.txt',
         File.dirname(__FILE__))
-      expected = File.read(file)
+      expected = open(file, 'rb'){|file| file.read}
       assert_equal(expected, stream.decode_raw_stream)
     end
     def test_raw_stream
@@ -51,15 +52,15 @@ BT /F1 16 Tf 1 0 0 -1 0 14.347 Tm(Ponstan
     end
     def test_decoded_stream2
       file = File.expand_path('./data/firststream', File.dirname(__FILE__))
-      @stream.raw_stream = File.read(file)
+      @stream.raw_stream = open(file, 'rb'){|file| file.read}
       @stream.attributes[:filter] = "/FlateDecode"
       file = File.expand_path('./data/test.txt', File.dirname(__FILE__))
-      expected = File.read(file)
+      expected = open(file, 'rb'){|file| file.read}
       assert_equal(expected, @stream.decoded_stream)
     end
     def test_extract_text_objects
       file = File.expand_path('./data/stream.txt', File.dirname(__FILE__))
-      @stream.decoded_stream = File.read(file)
+      @stream.decoded_stream = open(file, 'rb'){|file| file.read}
       result = @stream.extract_text_objects(nil, TextState.new).select { |res|
         res.is_a?(TextState)
       }