rpdf2txt 0.8.3 → 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,7 @@
1
+ === 0.8.4 / 06.01.2012
2
+
3
+ * Updated Manifest.txt to include the _*.grammar files
4
+
1
5
  === 0.8.3 / 05.01.2012
2
6
 
3
7
  * Getting ready for Ruby 1.9.3, partially, see README for more.
@@ -1,5 +1,5 @@
1
- LICENCE
2
1
  History.txt
2
+ LICENCE
3
3
  Manifest.txt
4
4
  README.txt
5
5
  Rakefile
@@ -31,6 +31,10 @@ lib/rpdf2txt-rockit/token.rb
31
31
  lib/rpdf2txt-rockit/version.rb
32
32
  lib/rpdf2txt/attributesparser.rb
33
33
  lib/rpdf2txt/cmapparser.rb
34
+ lib/rpdf2txt/data/_cmap.grammar
35
+ lib/rpdf2txt/data/_cmap_range.grammar
36
+ lib/rpdf2txt/data/_pdfattributes.grammar
37
+ lib/rpdf2txt/data/_pdftext.grammar
34
38
  lib/rpdf2txt/data/cmap.grammar
35
39
  lib/rpdf2txt/data/cmap.rb
36
40
  lib/rpdf2txt/data/cmap_range.grammar
@@ -111,6 +115,7 @@ test/data/working_obj
111
115
  test/data/working_obj2
112
116
  test/mock.rb
113
117
  test/suite.rb
118
+ test/test_object.rb
114
119
  test/test_pdf_object.rb
115
120
  test/test_pdf_parser.rb
116
121
  test/test_pdf_text.rb
@@ -0,0 +1,11 @@
1
+ Grammar CMap
2
+ Tokens
3
+ SPACE = /\s+/n [:Skip]
4
+ HEXSNIPPET = /[0-9A-F]+/in
5
+ Productions
6
+ HexArray -> RangeDef+
7
+ [^: values ]
8
+ RangeDef -> HexElement HexElement
9
+ [: source, target ]
10
+ HexElement -> '<' HEXSNIPPET '>'
11
+ [^: _, hexsnip, _ ]
@@ -0,0 +1,15 @@
1
+ Grammar CMap
2
+ Tokens
3
+ SPACE = /\s+/n [:Skip]
4
+ HEXSNIPPET = /[0-9A-F]+/in
5
+ Productions
6
+ HexArray -> RangeDef+
7
+ [^: values ]
8
+ RangeDef -> HexElement HexElement HexElement
9
+ [: start, stop, offset ]
10
+ | HexElement HexElement Explicit
11
+ [: start, stop, explicit ]
12
+ Explicit -> '[' HexElement+ ']'
13
+ [^: _, explicit, _ ]
14
+ HexElement -> '<' HEXSNIPPET '>'
15
+ [^: _, hexsnip, _ ]
@@ -0,0 +1,32 @@
1
+ Grammar PdfAttributes
2
+ Tokens
3
+ IDENTIFIER = /^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n
4
+ NUMERIC = /-?[0-9]+([.,][0-9]+)?/n
5
+ REFERENCE = /[0-9]+\s+[0-9]+\s+R/n
6
+ WORD = /[a-z\.]{2,}/in
7
+ SPACE = /\s+/mn [:Skip]
8
+ COMMENT = /%[^%]*%/mn [:Skip]
9
+ FILEIDENTIFIER = /<[a-zA-Z0-9\n]+>/n
10
+ Productions
11
+ Expr -> IDENTIFIER [^: val]
12
+ | Hash [^: val]
13
+ | Array [^: val]
14
+ | REFERENCE [^: val]
15
+ | NUMERIC [^: val]
16
+ | Text [^: val]
17
+ | Date [^: val]
18
+ | WORD [^: val]
19
+ | FILEIDENTIFIER [^: val]
20
+ Array -> '[' ArrayElements ']'
21
+ [: _, values, _]
22
+ | '[' ']'
23
+ [: _, _]
24
+ ArrayElements -> ArrayElement+
25
+ [^: values]
26
+ ArrayElement -> (Array|Hash|NUMERIC|IDENTIFIER|REFERENCE|WORD|FILEIDENTIFIER|Text)
27
+ [^: _]
28
+ Hash -> '<<' (IDENTIFIER Expr)* '>>'
29
+ [: _, pairs, _]
30
+ Date -> '(D:' /[\d+']+/n ')'
31
+ Text -> /\(([^\)\\]|\\[\(\)\\]?)*?\)/n
32
+ [: text]
@@ -0,0 +1,102 @@
1
+ Grammar PdfText
2
+ Tokens
3
+ NUMERIC = /-?(([0-9]*[.,_][0-9]+)|([0-9]+))/n
4
+ SPACE = /\s+/n [:Skip]
5
+ HEXSNIPPET = /[0-9A-F]+/in
6
+ ALPHANUMERIC = /[A-Z0-9\-_?\.]+/in
7
+ TEXTSNIPPET = /\(([^\)\\]|\\[\)\\]?)+\)/n
8
+ TJSNIPPET = /\((\\\(|\([^\(\)]*\)|[^\)\\]|\\[\)\\]?)*\)/n
9
+ IDENTIFIER = /^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n
10
+ WORD = /[a-z\.]{2,}/in
11
+ Productions
12
+ Target -> 'BT' Exprs 'ET' [: _, values, _]
13
+ Exprs -> Expr+ [^: values]
14
+ Expr -> TmElement [^: val]
15
+ | Array [^: val]
16
+ | TDElement [^: val]
17
+ | TdElement [^: val]
18
+ | TcElement [^: val]
19
+ | TWElement [^: val]
20
+ | TfElement [^: val]
21
+ | TjElement [^: val]
22
+ | TrElement [^: val]
23
+ | TjHex [^: val]
24
+ | TStarElement [^: val]
25
+ | TSElement [^: val]
26
+ | TLElement [^: val]
27
+ | TZElement [^: val]
28
+ | RgElement[^: val]
29
+ | Apostroph [^: val]
30
+ | Quotes [^: val]
31
+ | LineWidth [^: val]
32
+ | UElement [^: val]
33
+ Array -> '[' TJArrayElements /\]\s*TJ/n
34
+ [: _, values, _]
35
+ TmElement -> NUMERIC NUMERIC NUMERIC NUMERIC
36
+ NUMERIC NUMERIC 'Tm'
37
+ [Tm: xscale, alpha, beta, yscale, tmx, tmy, _]
38
+ TDElement -> NUMERIC NUMERIC 'TD'
39
+ [TD: tdleadx, tdleady,_]
40
+ TdElement -> NUMERIC NUMERIC 'Td'
41
+ [Td: xpos, ypos,_]
42
+ TcElement -> NUMERIC 'Tc'
43
+ [Tc: charspace, _]
44
+ TWElement -> NUMERIC 'Tw'
45
+ [TW: wordspace, _]
46
+ TfElement -> '/' ALPHANUMERIC NUMERIC 'Tf'
47
+ [Tf: _, fontname, fontsize, _]
48
+ TLElement -> NUMERIC 'TL'
49
+ [TL: lead, _]
50
+ TjElement -> TJSNIPPET 'Tj'
51
+ [Tj: snippet, _ ]
52
+ HexElement -> '<' HEXSNIPPET '>'
53
+ [^: _, hex, _ ]
54
+ | '<' '>'
55
+ [^: _, _ ]
56
+ TjHex -> HexElement 'Tj'
57
+ [Tjhex: hexsnippet, _ ]
58
+ TJArrayElements -> TJSingleElement+
59
+ [^: values ]
60
+ TJSingleElement -> TJSNIPPET
61
+ [: snippet ]
62
+ | NUMERIC
63
+ [: kerning ]
64
+ | HexElement
65
+ [: hexsnippet ]
66
+ | '())'
67
+ TSElement -> NUMERIC 'Ts'
68
+ [Ts: textrise]
69
+ TStarElement -> 'T*'
70
+ [TStar: linebreak ]
71
+ TZElement -> NUMERIC 'Tz'
72
+ [Tz: hscaling]
73
+ RgElement -> NUMERIC NUMERIC NUMERIC 'rg'
74
+ [Rg: _, _, _, _]
75
+ TrElement -> NUMERIC 'Tr'
76
+ [Tr: render]
77
+ Apostroph -> TJSNIPPET "'"
78
+ [Apo: aposnippet, _]
79
+ Quotes -> NUMERIC NUMERIC TJSNIPPET '"'
80
+ [Quo: wordspace, charspace, aposnippet, _]
81
+ LineWidth -> NUMERIC 'w'
82
+ [Width: width, _]
83
+ BTElement -> /^BT$/n
84
+ [BT: _]
85
+ ETElement -> /^ET$/n
86
+ [ET: _]
87
+ Hash -> '<<' (IDENTIFIER HashExpr)* '>>'
88
+ [: _, pairs, _]
89
+ HashExpr -> IDENTIFIER [^: val]
90
+ | NUMERIC [^: val]
91
+ | WORD [^: val]
92
+ UElement -> NUMERIC NUMERIC NUMERIC NUMERIC 'k'
93
+ | '/' ALPHANUMERIC /[gc]s/in
94
+ | '/' ALPHANUMERIC Hash /BDC|BMC/
95
+ | 'EMC'
96
+ | CNElements /scn?/in
97
+ | NUMERIC /[gijJMG]/n
98
+ | NUMERIC NUMERIC NUMERIC 'RG'
99
+ | '[]' NUMERIC 'd'
100
+ CNElements -> CNElement+
101
+ CNElement -> ALPHANUMERIC
102
+ | '/' ALPHANUMERIC ALPHANUMERIC?
@@ -29,7 +29,7 @@ require 'rpdf2txt/default_handler'
29
29
  require 'digest/md5'
30
30
 
31
31
  module Rpdf2txt
32
- VERSION = '0.8.3'
32
+ VERSION = '0.8.4'
33
33
  class Parser
34
34
  attr_accessor :encrypt
35
35
  def initialize(pdf_stream, target_encoding='utf8')
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rpdf2txt
3
3
  version: !ruby/object:Gem::Version
4
- hash: 57
4
+ hash: 55
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 8
9
- - 3
10
- version: 0.8.3
9
+ - 4
10
+ version: 0.8.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Masaomi Hatakeyama, Zeno R.R. Davatz
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-01-05 00:00:00 +01:00
18
+ date: 2012-01-06 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -50,8 +50,8 @@ extra_rdoc_files:
50
50
  - usage-en.txt
51
51
  - user-stories/UserStories_Rpdf2Txt.txt
52
52
  files:
53
- - LICENCE
54
53
  - History.txt
54
+ - LICENCE
55
55
  - Manifest.txt
56
56
  - README.txt
57
57
  - Rakefile
@@ -83,6 +83,10 @@ files:
83
83
  - lib/rpdf2txt-rockit/version.rb
84
84
  - lib/rpdf2txt/attributesparser.rb
85
85
  - lib/rpdf2txt/cmapparser.rb
86
+ - lib/rpdf2txt/data/_cmap.grammar
87
+ - lib/rpdf2txt/data/_cmap_range.grammar
88
+ - lib/rpdf2txt/data/_pdfattributes.grammar
89
+ - lib/rpdf2txt/data/_pdftext.grammar
86
90
  - lib/rpdf2txt/data/cmap.grammar
87
91
  - lib/rpdf2txt/data/cmap.rb
88
92
  - lib/rpdf2txt/data/cmap_range.grammar
@@ -163,6 +167,7 @@ files:
163
167
  - test/data/working_obj2
164
168
  - test/mock.rb
165
169
  - test/suite.rb
170
+ - test/test_object.rb
166
171
  - test/test_pdf_object.rb
167
172
  - test/test_pdf_parser.rb
168
173
  - test/test_pdf_text.rb
@@ -172,7 +177,6 @@ files:
172
177
  - usage-en.txt
173
178
  - user-stories/UserStories_Rpdf2Txt.txt
174
179
  - user-stories/documents/swissmedicjournal/04_2004.pdf
175
- - test/test_object.rb
176
180
  - .gemtest
177
181
  has_rdoc: true
178
182
  homepage: http://scm.ywesee.com/?p=rpdf2txt/.git;a=summary