rpdf2txt 0.8.3 → 0.8.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,7 @@
1
+ === 0.8.4 / 06.01.2012
2
+
3
+ * Updated Manifest.txt to include the _*.grammar files
4
+
1
5
  === 0.8.3 / 05.01.2012
2
6
 
3
7
  * Getting ready for Ruby 1.9.3, partially, see README for more.
@@ -1,5 +1,5 @@
1
- LICENCE
2
1
  History.txt
2
+ LICENCE
3
3
  Manifest.txt
4
4
  README.txt
5
5
  Rakefile
@@ -31,6 +31,10 @@ lib/rpdf2txt-rockit/token.rb
31
31
  lib/rpdf2txt-rockit/version.rb
32
32
  lib/rpdf2txt/attributesparser.rb
33
33
  lib/rpdf2txt/cmapparser.rb
34
+ lib/rpdf2txt/data/_cmap.grammar
35
+ lib/rpdf2txt/data/_cmap_range.grammar
36
+ lib/rpdf2txt/data/_pdfattributes.grammar
37
+ lib/rpdf2txt/data/_pdftext.grammar
34
38
  lib/rpdf2txt/data/cmap.grammar
35
39
  lib/rpdf2txt/data/cmap.rb
36
40
  lib/rpdf2txt/data/cmap_range.grammar
@@ -111,6 +115,7 @@ test/data/working_obj
111
115
  test/data/working_obj2
112
116
  test/mock.rb
113
117
  test/suite.rb
118
+ test/test_object.rb
114
119
  test/test_pdf_object.rb
115
120
  test/test_pdf_parser.rb
116
121
  test/test_pdf_text.rb
@@ -0,0 +1,11 @@
1
+ Grammar CMap
2
+ Tokens
3
+ SPACE = /\s+/n [:Skip]
4
+ HEXSNIPPET = /[0-9A-F]+/in
5
+ Productions
6
+ HexArray -> RangeDef+
7
+ [^: values ]
8
+ RangeDef -> HexElement HexElement
9
+ [: source, target ]
10
+ HexElement -> '<' HEXSNIPPET '>'
11
+ [^: _, hexsnip, _ ]
@@ -0,0 +1,15 @@
1
+ Grammar CMap
2
+ Tokens
3
+ SPACE = /\s+/n [:Skip]
4
+ HEXSNIPPET = /[0-9A-F]+/in
5
+ Productions
6
+ HexArray -> RangeDef+
7
+ [^: values ]
8
+ RangeDef -> HexElement HexElement HexElement
9
+ [: start, stop, offset ]
10
+ | HexElement HexElement Explicit
11
+ [: start, stop, explicit ]
12
+ Explicit -> '[' HexElement+ ']'
13
+ [^: _, explicit, _ ]
14
+ HexElement -> '<' HEXSNIPPET '>'
15
+ [^: _, hexsnip, _ ]
@@ -0,0 +1,32 @@
1
+ Grammar PdfAttributes
2
+ Tokens
3
+ IDENTIFIER = /^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n
4
+ NUMERIC = /-?[0-9]+([.,][0-9]+)?/n
5
+ REFERENCE = /[0-9]+\s+[0-9]+\s+R/n
6
+ WORD = /[a-z\.]{2,}/in
7
+ SPACE = /\s+/mn [:Skip]
8
+ COMMENT = /%[^%]*%/mn [:Skip]
9
+ FILEIDENTIFIER = /<[a-zA-Z0-9\n]+>/n
10
+ Productions
11
+ Expr -> IDENTIFIER [^: val]
12
+ | Hash [^: val]
13
+ | Array [^: val]
14
+ | REFERENCE [^: val]
15
+ | NUMERIC [^: val]
16
+ | Text [^: val]
17
+ | Date [^: val]
18
+ | WORD [^: val]
19
+ | FILEIDENTIFIER [^: val]
20
+ Array -> '[' ArrayElements ']'
21
+ [: _, values, _]
22
+ | '[' ']'
23
+ [: _, _]
24
+ ArrayElements -> ArrayElement+
25
+ [^: values]
26
+ ArrayElement -> (Array|Hash|NUMERIC|IDENTIFIER|REFERENCE|WORD|FILEIDENTIFIER|Text)
27
+ [^: _]
28
+ Hash -> '<<' (IDENTIFIER Expr)* '>>'
29
+ [: _, pairs, _]
30
+ Date -> '(D:' /[\d+']+/n ')'
31
+ Text -> /\(([^\)\\]|\\[\(\)\\]?)*?\)/n
32
+ [: text]
@@ -0,0 +1,102 @@
1
+ Grammar PdfText
2
+ Tokens
3
+ NUMERIC = /-?(([0-9]*[.,_][0-9]+)|([0-9]+))/n
4
+ SPACE = /\s+/n [:Skip]
5
+ HEXSNIPPET = /[0-9A-F]+/in
6
+ ALPHANUMERIC = /[A-Z0-9\-_?\.]+/in
7
+ TEXTSNIPPET = /\(([^\)\\]|\\[\)\\]?)+\)/n
8
+ TJSNIPPET = /\((\\\(|\([^\(\)]*\)|[^\)\\]|\\[\)\\]?)*\)/n
9
+ IDENTIFIER = /^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n
10
+ WORD = /[a-z\.]{2,}/in
11
+ Productions
12
+ Target -> 'BT' Exprs 'ET' [: _, values, _]
13
+ Exprs -> Expr+ [^: values]
14
+ Expr -> TmElement [^: val]
15
+ | Array [^: val]
16
+ | TDElement [^: val]
17
+ | TdElement [^: val]
18
+ | TcElement [^: val]
19
+ | TWElement [^: val]
20
+ | TfElement [^: val]
21
+ | TjElement [^: val]
22
+ | TrElement [^: val]
23
+ | TjHex [^: val]
24
+ | TStarElement [^: val]
25
+ | TSElement [^: val]
26
+ | TLElement [^: val]
27
+ | TZElement [^: val]
28
+ | RgElement[^: val]
29
+ | Apostroph [^: val]
30
+ | Quotes [^: val]
31
+ | LineWidth [^: val]
32
+ | UElement [^: val]
33
+ Array -> '[' TJArrayElements /\]\s*TJ/n
34
+ [: _, values, _]
35
+ TmElement -> NUMERIC NUMERIC NUMERIC NUMERIC
36
+ NUMERIC NUMERIC 'Tm'
37
+ [Tm: xscale, alpha, beta, yscale, tmx, tmy, _]
38
+ TDElement -> NUMERIC NUMERIC 'TD'
39
+ [TD: tdleadx, tdleady,_]
40
+ TdElement -> NUMERIC NUMERIC 'Td'
41
+ [Td: xpos, ypos,_]
42
+ TcElement -> NUMERIC 'Tc'
43
+ [Tc: charspace, _]
44
+ TWElement -> NUMERIC 'Tw'
45
+ [TW: wordspace, _]
46
+ TfElement -> '/' ALPHANUMERIC NUMERIC 'Tf'
47
+ [Tf: _, fontname, fontsize, _]
48
+ TLElement -> NUMERIC 'TL'
49
+ [TL: lead, _]
50
+ TjElement -> TJSNIPPET 'Tj'
51
+ [Tj: snippet, _ ]
52
+ HexElement -> '<' HEXSNIPPET '>'
53
+ [^: _, hex, _ ]
54
+ | '<' '>'
55
+ [^: _, _ ]
56
+ TjHex -> HexElement 'Tj'
57
+ [Tjhex: hexsnippet, _ ]
58
+ TJArrayElements -> TJSingleElement+
59
+ [^: values ]
60
+ TJSingleElement -> TJSNIPPET
61
+ [: snippet ]
62
+ | NUMERIC
63
+ [: kerning ]
64
+ | HexElement
65
+ [: hexsnippet ]
66
+ | '())'
67
+ TSElement -> NUMERIC 'Ts'
68
+ [Ts: textrise]
69
+ TStarElement -> 'T*'
70
+ [TStar: linebreak ]
71
+ TZElement -> NUMERIC 'Tz'
72
+ [Tz: hscaling]
73
+ RgElement -> NUMERIC NUMERIC NUMERIC 'rg'
74
+ [Rg: _, _, _, _]
75
+ TrElement -> NUMERIC 'Tr'
76
+ [Tr: render]
77
+ Apostroph -> TJSNIPPET "'"
78
+ [Apo: aposnippet, _]
79
+ Quotes -> NUMERIC NUMERIC TJSNIPPET '"'
80
+ [Quo: wordspace, charspace, aposnippet, _]
81
+ LineWidth -> NUMERIC 'w'
82
+ [Width: width, _]
83
+ BTElement -> /^BT$/n
84
+ [BT: _]
85
+ ETElement -> /^ET$/n
86
+ [ET: _]
87
+ Hash -> '<<' (IDENTIFIER HashExpr)* '>>'
88
+ [: _, pairs, _]
89
+ HashExpr -> IDENTIFIER [^: val]
90
+ | NUMERIC [^: val]
91
+ | WORD [^: val]
92
+ UElement -> NUMERIC NUMERIC NUMERIC NUMERIC 'k'
93
+ | '/' ALPHANUMERIC /[gc]s/in
94
+ | '/' ALPHANUMERIC Hash /BDC|BMC/
95
+ | 'EMC'
96
+ | CNElements /scn?/in
97
+ | NUMERIC /[gijJMG]/n
98
+ | NUMERIC NUMERIC NUMERIC 'RG'
99
+ | '[]' NUMERIC 'd'
100
+ CNElements -> CNElement+
101
+ CNElement -> ALPHANUMERIC
102
+ | '/' ALPHANUMERIC ALPHANUMERIC?
@@ -29,7 +29,7 @@ require 'rpdf2txt/default_handler'
29
29
  require 'digest/md5'
30
30
 
31
31
  module Rpdf2txt
32
- VERSION = '0.8.3'
32
+ VERSION = '0.8.4'
33
33
  class Parser
34
34
  attr_accessor :encrypt
35
35
  def initialize(pdf_stream, target_encoding='utf8')
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rpdf2txt
3
3
  version: !ruby/object:Gem::Version
4
- hash: 57
4
+ hash: 55
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 8
9
- - 3
10
- version: 0.8.3
9
+ - 4
10
+ version: 0.8.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Masaomi Hatakeyama, Zeno R.R. Davatz
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-01-05 00:00:00 +01:00
18
+ date: 2012-01-06 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -50,8 +50,8 @@ extra_rdoc_files:
50
50
  - usage-en.txt
51
51
  - user-stories/UserStories_Rpdf2Txt.txt
52
52
  files:
53
- - LICENCE
54
53
  - History.txt
54
+ - LICENCE
55
55
  - Manifest.txt
56
56
  - README.txt
57
57
  - Rakefile
@@ -83,6 +83,10 @@ files:
83
83
  - lib/rpdf2txt-rockit/version.rb
84
84
  - lib/rpdf2txt/attributesparser.rb
85
85
  - lib/rpdf2txt/cmapparser.rb
86
+ - lib/rpdf2txt/data/_cmap.grammar
87
+ - lib/rpdf2txt/data/_cmap_range.grammar
88
+ - lib/rpdf2txt/data/_pdfattributes.grammar
89
+ - lib/rpdf2txt/data/_pdftext.grammar
86
90
  - lib/rpdf2txt/data/cmap.grammar
87
91
  - lib/rpdf2txt/data/cmap.rb
88
92
  - lib/rpdf2txt/data/cmap_range.grammar
@@ -163,6 +167,7 @@ files:
163
167
  - test/data/working_obj2
164
168
  - test/mock.rb
165
169
  - test/suite.rb
170
+ - test/test_object.rb
166
171
  - test/test_pdf_object.rb
167
172
  - test/test_pdf_parser.rb
168
173
  - test/test_pdf_text.rb
@@ -172,7 +177,6 @@ files:
172
177
  - usage-en.txt
173
178
  - user-stories/UserStories_Rpdf2Txt.txt
174
179
  - user-stories/documents/swissmedicjournal/04_2004.pdf
175
- - test/test_object.rb
176
180
  - .gemtest
177
181
  has_rdoc: true
178
182
  homepage: http://scm.ywesee.com/?p=rpdf2txt/.git;a=summary