rpdf2txt 0.8.3 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +6 -1
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/_pdftext.grammar +102 -0
- data/lib/rpdf2txt/parser.rb +1 -1
- metadata +10 -6
    
        data/History.txt
    CHANGED
    
    
    
        data/Manifest.txt
    CHANGED
    
    | @@ -1,5 +1,5 @@ | |
| 1 | 
            -
            LICENCE
         | 
| 2 1 | 
             
            History.txt
         | 
| 2 | 
            +
            LICENCE
         | 
| 3 3 | 
             
            Manifest.txt
         | 
| 4 4 | 
             
            README.txt
         | 
| 5 5 | 
             
            Rakefile
         | 
| @@ -31,6 +31,10 @@ lib/rpdf2txt-rockit/token.rb | |
| 31 31 | 
             
            lib/rpdf2txt-rockit/version.rb
         | 
| 32 32 | 
             
            lib/rpdf2txt/attributesparser.rb
         | 
| 33 33 | 
             
            lib/rpdf2txt/cmapparser.rb
         | 
| 34 | 
            +
            lib/rpdf2txt/data/_cmap.grammar
         | 
| 35 | 
            +
            lib/rpdf2txt/data/_cmap_range.grammar
         | 
| 36 | 
            +
            lib/rpdf2txt/data/_pdfattributes.grammar
         | 
| 37 | 
            +
            lib/rpdf2txt/data/_pdftext.grammar
         | 
| 34 38 | 
             
            lib/rpdf2txt/data/cmap.grammar
         | 
| 35 39 | 
             
            lib/rpdf2txt/data/cmap.rb
         | 
| 36 40 | 
             
            lib/rpdf2txt/data/cmap_range.grammar
         | 
| @@ -111,6 +115,7 @@ test/data/working_obj | |
| 111 115 | 
             
            test/data/working_obj2
         | 
| 112 116 | 
             
            test/mock.rb
         | 
| 113 117 | 
             
            test/suite.rb
         | 
| 118 | 
            +
            test/test_object.rb
         | 
| 114 119 | 
             
            test/test_pdf_object.rb
         | 
| 115 120 | 
             
            test/test_pdf_parser.rb
         | 
| 116 121 | 
             
            test/test_pdf_text.rb
         | 
| @@ -0,0 +1,15 @@ | |
| 1 | 
            +
            Grammar CMap
         | 
| 2 | 
            +
            	Tokens
         | 
| 3 | 
            +
            		SPACE = /\s+/n [:Skip]
         | 
| 4 | 
            +
            		HEXSNIPPET = /[0-9A-F]+/in
         | 
| 5 | 
            +
            	Productions
         | 
| 6 | 
            +
            		HexArray			->	RangeDef+
         | 
| 7 | 
            +
            											[^: values ]
         | 
| 8 | 
            +
            		RangeDef			->	HexElement HexElement HexElement
         | 
| 9 | 
            +
            											[: start, stop, offset ]
         | 
| 10 | 
            +
            									|		HexElement HexElement Explicit
         | 
| 11 | 
            +
            											[: start, stop, explicit ]
         | 
| 12 | 
            +
            		Explicit			->	'[' HexElement+ ']'
         | 
| 13 | 
            +
            											[^: _, explicit, _ ]
         | 
| 14 | 
            +
            	  HexElement		->	'<' HEXSNIPPET '>'
         | 
| 15 | 
            +
            											[^: _, hexsnip, _ ]
         | 
| @@ -0,0 +1,32 @@ | |
| 1 | 
            +
            Grammar PdfAttributes
         | 
| 2 | 
            +
            	Tokens
         | 
| 3 | 
            +
            		IDENTIFIER = /^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n
         | 
| 4 | 
            +
            		NUMERIC = /-?[0-9]+([.,][0-9]+)?/n
         | 
| 5 | 
            +
            		REFERENCE = /[0-9]+\s+[0-9]+\s+R/n
         | 
| 6 | 
            +
            		WORD = /[a-z\.]{2,}/in
         | 
| 7 | 
            +
            		SPACE = /\s+/mn [:Skip]
         | 
| 8 | 
            +
            		COMMENT = /%[^%]*%/mn [:Skip]
         | 
| 9 | 
            +
            		FILEIDENTIFIER = /<[a-zA-Z0-9\n]+>/n
         | 
| 10 | 
            +
            	Productions
         | 
| 11 | 
            +
            		Expr				->	IDENTIFIER [^: val]
         | 
| 12 | 
            +
            								|		Hash [^: val]
         | 
| 13 | 
            +
            								|		Array [^: val]
         | 
| 14 | 
            +
            								|		REFERENCE [^: val]
         | 
| 15 | 
            +
            								|		NUMERIC [^: val]
         | 
| 16 | 
            +
            								|		Text [^: val]
         | 
| 17 | 
            +
            								|		Date [^: val]
         | 
| 18 | 
            +
            								|		WORD [^: val]
         | 
| 19 | 
            +
            								|   FILEIDENTIFIER [^: val]
         | 
| 20 | 
            +
            		Array				->	'[' ArrayElements ']'
         | 
| 21 | 
            +
            										[: _, values, _]
         | 
| 22 | 
            +
            								|  '[' ']'
         | 
| 23 | 
            +
            										[: _, _]
         | 
| 24 | 
            +
            		ArrayElements -> ArrayElement+
         | 
| 25 | 
            +
            										[^: values]
         | 
| 26 | 
            +
            		ArrayElement -> (Array|Hash|NUMERIC|IDENTIFIER|REFERENCE|WORD|FILEIDENTIFIER|Text)
         | 
| 27 | 
            +
            										[^: _]
         | 
| 28 | 
            +
            		Hash				->	'<<' (IDENTIFIER Expr)* '>>'
         | 
| 29 | 
            +
            										[: _, pairs, _]
         | 
| 30 | 
            +
            		Date				->	'(D:' /[\d+']+/n ')'
         | 
| 31 | 
            +
            		Text				->	/\(([^\)\\]|\\[\(\)\\]?)*?\)/n
         | 
| 32 | 
            +
            										[: text]
         | 
| @@ -0,0 +1,102 @@ | |
| 1 | 
            +
            Grammar PdfText
         | 
| 2 | 
            +
            	Tokens
         | 
| 3 | 
            +
            		NUMERIC = /-?(([0-9]*[.,_][0-9]+)|([0-9]+))/n
         | 
| 4 | 
            +
            		SPACE = /\s+/n [:Skip]
         | 
| 5 | 
            +
            		HEXSNIPPET = /[0-9A-F]+/in
         | 
| 6 | 
            +
            		ALPHANUMERIC  = /[A-Z0-9\-_?\.]+/in
         | 
| 7 | 
            +
            		TEXTSNIPPET = /\(([^\)\\]|\\[\)\\]?)+\)/n
         | 
| 8 | 
            +
            		TJSNIPPET = /\((\\\(|\([^\(\)]*\)|[^\)\\]|\\[\)\\]?)*\)/n
         | 
| 9 | 
            +
            		IDENTIFIER = /^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n
         | 
| 10 | 
            +
            		WORD = /[a-z\.]{2,}/in
         | 
| 11 | 
            +
            	Productions
         | 
| 12 | 
            +
            		Target			->	'BT' Exprs 'ET'	[: _, values, _]
         | 
| 13 | 
            +
            		Exprs				->	Expr+ [^: values]
         | 
| 14 | 
            +
            		Expr				->	TmElement [^: val]
         | 
| 15 | 
            +
            								|		Array		[^: val]
         | 
| 16 | 
            +
            								|		TDElement [^: val]
         | 
| 17 | 
            +
            								|		TdElement [^: val]
         | 
| 18 | 
            +
            								|		TcElement [^: val]
         | 
| 19 | 
            +
            								|		TWElement [^: val]	
         | 
| 20 | 
            +
            								|		TfElement [^: val]
         | 
| 21 | 
            +
            								|		TjElement [^: val]
         | 
| 22 | 
            +
            								|		TrElement [^: val]
         | 
| 23 | 
            +
            							  |		TjHex			[^: val]
         | 
| 24 | 
            +
            								|		TStarElement [^: val]
         | 
| 25 | 
            +
            								|		TSElement [^: val]
         | 
| 26 | 
            +
            								|		TLElement [^: val]
         | 
| 27 | 
            +
            								|		TZElement	[^: val]
         | 
| 28 | 
            +
            								|		RgElement[^: val]
         | 
| 29 | 
            +
            								|		Apostroph [^: val]
         | 
| 30 | 
            +
            								|		Quotes [^: val]
         | 
| 31 | 
            +
                            |   LineWidth [^: val]
         | 
| 32 | 
            +
            								|		UElement  [^: val]
         | 
| 33 | 
            +
            		Array			->	'[' TJArrayElements /\]\s*TJ/n
         | 
| 34 | 
            +
            										 [: _, values, _]
         | 
| 35 | 
            +
            		TmElement		->	NUMERIC NUMERIC NUMERIC NUMERIC
         | 
| 36 | 
            +
            										NUMERIC NUMERIC 'Tm'
         | 
| 37 | 
            +
            										[Tm: xscale, alpha, beta, yscale, tmx, tmy, _]
         | 
| 38 | 
            +
            		TDElement		->	NUMERIC NUMERIC 'TD'
         | 
| 39 | 
            +
            										[TD: tdleadx, tdleady,_]
         | 
| 40 | 
            +
            		TdElement		->	NUMERIC NUMERIC 'Td'
         | 
| 41 | 
            +
            										[Td: xpos, ypos,_]
         | 
| 42 | 
            +
            		TcElement		->	NUMERIC 'Tc'
         | 
| 43 | 
            +
            										[Tc: charspace, _]
         | 
| 44 | 
            +
            		TWElement		->	NUMERIC 'Tw'
         | 
| 45 | 
            +
            										[TW: wordspace, _]
         | 
| 46 | 
            +
            		TfElement		->	'/' ALPHANUMERIC NUMERIC 'Tf' 
         | 
| 47 | 
            +
            										[Tf: _, fontname, fontsize, _]
         | 
| 48 | 
            +
            		TLElement		->	NUMERIC 'TL'
         | 
| 49 | 
            +
            										[TL: lead, _]
         | 
| 50 | 
            +
            		TjElement		->	TJSNIPPET 'Tj'
         | 
| 51 | 
            +
            										[Tj: snippet, _ ]
         | 
| 52 | 
            +
            		HexElement	->	'<' HEXSNIPPET '>'
         | 
| 53 | 
            +
            										[^: _, hex, _ ]
         | 
| 54 | 
            +
                            |   '<' '>'
         | 
| 55 | 
            +
            										[^: _, _ ]
         | 
| 56 | 
            +
            	  TjHex				->	HexElement 'Tj'
         | 
| 57 | 
            +
            										[Tjhex: hexsnippet, _ ]
         | 
| 58 | 
            +
            		TJArrayElements	->	TJSingleElement+
         | 
| 59 | 
            +
            												[^: values ]
         | 
| 60 | 
            +
            		TJSingleElement	->	TJSNIPPET
         | 
| 61 | 
            +
            												[: snippet ]
         | 
| 62 | 
            +
            										|		NUMERIC
         | 
| 63 | 
            +
            												[: kerning ]
         | 
| 64 | 
            +
            										|		HexElement
         | 
| 65 | 
            +
            												[: hexsnippet ]
         | 
| 66 | 
            +
                                |   '())'
         | 
| 67 | 
            +
            		TSElement		->	NUMERIC 'Ts'
         | 
| 68 | 
            +
            										[Ts: textrise]
         | 
| 69 | 
            +
            		TStarElement ->	'T*'
         | 
| 70 | 
            +
            										[TStar: linebreak ]
         | 
| 71 | 
            +
            		TZElement		->	NUMERIC 'Tz'
         | 
| 72 | 
            +
            										[Tz: hscaling]
         | 
| 73 | 
            +
            		RgElement		->	NUMERIC NUMERIC NUMERIC 'rg'
         | 
| 74 | 
            +
            										[Rg: _, _, _, _]
         | 
| 75 | 
            +
            		TrElement		->  NUMERIC 'Tr'
         | 
| 76 | 
            +
            										[Tr: render]
         | 
| 77 | 
            +
            		Apostroph		->	TJSNIPPET	"'"
         | 
| 78 | 
            +
            										[Apo: aposnippet, _]
         | 
| 79 | 
            +
            		Quotes			->	NUMERIC NUMERIC	TJSNIPPET '"'
         | 
| 80 | 
            +
            										[Quo: wordspace, charspace, aposnippet, _]
         | 
| 81 | 
            +
                LineWidth   ->  NUMERIC 'w'
         | 
| 82 | 
            +
            										[Width: width, _]
         | 
| 83 | 
            +
            		BTElement		->	/^BT$/n
         | 
| 84 | 
            +
            										[BT: _]
         | 
| 85 | 
            +
            		ETElement		->	/^ET$/n
         | 
| 86 | 
            +
            										[ET: _]
         | 
| 87 | 
            +
            		Hash				->	'<<' (IDENTIFIER HashExpr)* '>>'
         | 
| 88 | 
            +
            										[: _, pairs, _]
         | 
| 89 | 
            +
            		HashExpr		->	IDENTIFIER [^: val]
         | 
| 90 | 
            +
            								|		NUMERIC [^: val]
         | 
| 91 | 
            +
            								|		WORD [^: val]
         | 
| 92 | 
            +
            		UElement		->	NUMERIC NUMERIC NUMERIC NUMERIC 'k'
         | 
| 93 | 
            +
            								|		'/' ALPHANUMERIC /[gc]s/in
         | 
| 94 | 
            +
            								|		'/' ALPHANUMERIC Hash /BDC|BMC/
         | 
| 95 | 
            +
            								|		'EMC'
         | 
| 96 | 
            +
            								|		CNElements /scn?/in
         | 
| 97 | 
            +
            								|		NUMERIC /[gijJMG]/n
         | 
| 98 | 
            +
            								|		NUMERIC NUMERIC NUMERIC 'RG'
         | 
| 99 | 
            +
            								|		'[]' NUMERIC 'd'
         | 
| 100 | 
            +
                CNElements  ->  CNElement+
         | 
| 101 | 
            +
                CNElement   ->  ALPHANUMERIC
         | 
| 102 | 
            +
                            |   '/' ALPHANUMERIC ALPHANUMERIC?
         | 
    
        data/lib/rpdf2txt/parser.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,13 +1,13 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: rpdf2txt
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              hash:  | 
| 4 | 
            +
              hash: 55
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
              segments: 
         | 
| 7 7 | 
             
              - 0
         | 
| 8 8 | 
             
              - 8
         | 
| 9 | 
            -
              - 3
         | 
| 10 | 
            -
              version: 0.8.3
         | 
| 9 | 
            +
              - 4
         | 
| 10 | 
            +
              version: 0.8.4
         | 
| 11 11 | 
             
            platform: ruby
         | 
| 12 12 | 
             
            authors: 
         | 
| 13 13 | 
             
            - Masaomi Hatakeyama, Zeno R.R. Davatz
         | 
| @@ -15,7 +15,7 @@ autorequire: | |
| 15 15 | 
             
            bindir: bin
         | 
| 16 16 | 
             
            cert_chain: []
         | 
| 17 17 |  | 
| 18 | 
            -
            date: 2012-01- | 
| 18 | 
            +
            date: 2012-01-06 00:00:00 +01:00
         | 
| 19 19 | 
             
            default_executable: 
         | 
| 20 20 | 
             
            dependencies: 
         | 
| 21 21 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| @@ -50,8 +50,8 @@ extra_rdoc_files: | |
| 50 50 | 
             
            - usage-en.txt
         | 
| 51 51 | 
             
            - user-stories/UserStories_Rpdf2Txt.txt
         | 
| 52 52 | 
             
            files: 
         | 
| 53 | 
            -
            - LICENCE
         | 
| 54 53 | 
             
            - History.txt
         | 
| 54 | 
            +
            - LICENCE
         | 
| 55 55 | 
             
            - Manifest.txt
         | 
| 56 56 | 
             
            - README.txt
         | 
| 57 57 | 
             
            - Rakefile
         | 
| @@ -83,6 +83,10 @@ files: | |
| 83 83 | 
             
            - lib/rpdf2txt-rockit/version.rb
         | 
| 84 84 | 
             
            - lib/rpdf2txt/attributesparser.rb
         | 
| 85 85 | 
             
            - lib/rpdf2txt/cmapparser.rb
         | 
| 86 | 
            +
            - lib/rpdf2txt/data/_cmap.grammar
         | 
| 87 | 
            +
            - lib/rpdf2txt/data/_cmap_range.grammar
         | 
| 88 | 
            +
            - lib/rpdf2txt/data/_pdfattributes.grammar
         | 
| 89 | 
            +
            - lib/rpdf2txt/data/_pdftext.grammar
         | 
| 86 90 | 
             
            - lib/rpdf2txt/data/cmap.grammar
         | 
| 87 91 | 
             
            - lib/rpdf2txt/data/cmap.rb
         | 
| 88 92 | 
             
            - lib/rpdf2txt/data/cmap_range.grammar
         | 
| @@ -163,6 +167,7 @@ files: | |
| 163 167 | 
             
            - test/data/working_obj2
         | 
| 164 168 | 
             
            - test/mock.rb
         | 
| 165 169 | 
             
            - test/suite.rb
         | 
| 170 | 
            +
            - test/test_object.rb
         | 
| 166 171 | 
             
            - test/test_pdf_object.rb
         | 
| 167 172 | 
             
            - test/test_pdf_parser.rb
         | 
| 168 173 | 
             
            - test/test_pdf_text.rb
         | 
| @@ -172,7 +177,6 @@ files: | |
| 172 177 | 
             
            - usage-en.txt
         | 
| 173 178 | 
             
            - user-stories/UserStories_Rpdf2Txt.txt
         | 
| 174 179 | 
             
            - user-stories/documents/swissmedicjournal/04_2004.pdf
         | 
| 175 | 
            -
            - test/test_object.rb
         | 
| 176 180 | 
             
            - .gemtest
         | 
| 177 181 | 
             
            has_rdoc: true
         | 
| 178 182 | 
             
            homepage: http://scm.ywesee.com/?p=rpdf2txt/.git;a=summary
         |