rpdf2txt 0.8.3 → 0.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Manifest.txt +6 -1
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/_pdftext.grammar +102 -0
- data/lib/rpdf2txt/parser.rb +1 -1
- metadata +10 -6
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
LICENCE
|
2
1
|
History.txt
|
2
|
+
LICENCE
|
3
3
|
Manifest.txt
|
4
4
|
README.txt
|
5
5
|
Rakefile
|
@@ -31,6 +31,10 @@ lib/rpdf2txt-rockit/token.rb
|
|
31
31
|
lib/rpdf2txt-rockit/version.rb
|
32
32
|
lib/rpdf2txt/attributesparser.rb
|
33
33
|
lib/rpdf2txt/cmapparser.rb
|
34
|
+
lib/rpdf2txt/data/_cmap.grammar
|
35
|
+
lib/rpdf2txt/data/_cmap_range.grammar
|
36
|
+
lib/rpdf2txt/data/_pdfattributes.grammar
|
37
|
+
lib/rpdf2txt/data/_pdftext.grammar
|
34
38
|
lib/rpdf2txt/data/cmap.grammar
|
35
39
|
lib/rpdf2txt/data/cmap.rb
|
36
40
|
lib/rpdf2txt/data/cmap_range.grammar
|
@@ -111,6 +115,7 @@ test/data/working_obj
|
|
111
115
|
test/data/working_obj2
|
112
116
|
test/mock.rb
|
113
117
|
test/suite.rb
|
118
|
+
test/test_object.rb
|
114
119
|
test/test_pdf_object.rb
|
115
120
|
test/test_pdf_parser.rb
|
116
121
|
test/test_pdf_text.rb
|
@@ -0,0 +1,15 @@
|
|
1
|
+
Grammar CMap
|
2
|
+
Tokens
|
3
|
+
SPACE = /\s+/n [:Skip]
|
4
|
+
HEXSNIPPET = /[0-9A-F]+/in
|
5
|
+
Productions
|
6
|
+
HexArray -> RangeDef+
|
7
|
+
[^: values ]
|
8
|
+
RangeDef -> HexElement HexElement HexElement
|
9
|
+
[: start, stop, offset ]
|
10
|
+
| HexElement HexElement Explicit
|
11
|
+
[: start, stop, explicit ]
|
12
|
+
Explicit -> '[' HexElement+ ']'
|
13
|
+
[^: _, explicit, _ ]
|
14
|
+
HexElement -> '<' HEXSNIPPET '>'
|
15
|
+
[^: _, hexsnip, _ ]
|
@@ -0,0 +1,32 @@
|
|
1
|
+
Grammar PdfAttributes
|
2
|
+
Tokens
|
3
|
+
IDENTIFIER = /^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n
|
4
|
+
NUMERIC = /-?[0-9]+([.,][0-9]+)?/n
|
5
|
+
REFERENCE = /[0-9]+\s+[0-9]+\s+R/n
|
6
|
+
WORD = /[a-z\.]{2,}/in
|
7
|
+
SPACE = /\s+/mn [:Skip]
|
8
|
+
COMMENT = /%[^%]*%/mn [:Skip]
|
9
|
+
FILEIDENTIFIER = /<[a-zA-Z0-9\n]+>/n
|
10
|
+
Productions
|
11
|
+
Expr -> IDENTIFIER [^: val]
|
12
|
+
| Hash [^: val]
|
13
|
+
| Array [^: val]
|
14
|
+
| REFERENCE [^: val]
|
15
|
+
| NUMERIC [^: val]
|
16
|
+
| Text [^: val]
|
17
|
+
| Date [^: val]
|
18
|
+
| WORD [^: val]
|
19
|
+
| FILEIDENTIFIER [^: val]
|
20
|
+
Array -> '[' ArrayElements ']'
|
21
|
+
[: _, values, _]
|
22
|
+
| '[' ']'
|
23
|
+
[: _, _]
|
24
|
+
ArrayElements -> ArrayElement+
|
25
|
+
[^: values]
|
26
|
+
ArrayElement -> (Array|Hash|NUMERIC|IDENTIFIER|REFERENCE|WORD|FILEIDENTIFIER|Text)
|
27
|
+
[^: _]
|
28
|
+
Hash -> '<<' (IDENTIFIER Expr)* '>>'
|
29
|
+
[: _, pairs, _]
|
30
|
+
Date -> '(D:' /[\d+']+/n ')'
|
31
|
+
Text -> /\(([^\)\\]|\\[\(\)\\]?)*?\)/n
|
32
|
+
[: text]
|
@@ -0,0 +1,102 @@
|
|
1
|
+
Grammar PdfText
|
2
|
+
Tokens
|
3
|
+
NUMERIC = /-?(([0-9]*[.,_][0-9]+)|([0-9]+))/n
|
4
|
+
SPACE = /\s+/n [:Skip]
|
5
|
+
HEXSNIPPET = /[0-9A-F]+/in
|
6
|
+
ALPHANUMERIC = /[A-Z0-9\-_?\.]+/in
|
7
|
+
TEXTSNIPPET = /\(([^\)\\]|\\[\)\\]?)+\)/n
|
8
|
+
TJSNIPPET = /\((\\\(|\([^\(\)]*\)|[^\)\\]|\\[\)\\]?)*\)/n
|
9
|
+
IDENTIFIER = /^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n
|
10
|
+
WORD = /[a-z\.]{2,}/in
|
11
|
+
Productions
|
12
|
+
Target -> 'BT' Exprs 'ET' [: _, values, _]
|
13
|
+
Exprs -> Expr+ [^: values]
|
14
|
+
Expr -> TmElement [^: val]
|
15
|
+
| Array [^: val]
|
16
|
+
| TDElement [^: val]
|
17
|
+
| TdElement [^: val]
|
18
|
+
| TcElement [^: val]
|
19
|
+
| TWElement [^: val]
|
20
|
+
| TfElement [^: val]
|
21
|
+
| TjElement [^: val]
|
22
|
+
| TrElement [^: val]
|
23
|
+
| TjHex [^: val]
|
24
|
+
| TStarElement [^: val]
|
25
|
+
| TSElement [^: val]
|
26
|
+
| TLElement [^: val]
|
27
|
+
| TZElement [^: val]
|
28
|
+
| RgElement[^: val]
|
29
|
+
| Apostroph [^: val]
|
30
|
+
| Quotes [^: val]
|
31
|
+
| LineWidth [^: val]
|
32
|
+
| UElement [^: val]
|
33
|
+
Array -> '[' TJArrayElements /\]\s*TJ/n
|
34
|
+
[: _, values, _]
|
35
|
+
TmElement -> NUMERIC NUMERIC NUMERIC NUMERIC
|
36
|
+
NUMERIC NUMERIC 'Tm'
|
37
|
+
[Tm: xscale, alpha, beta, yscale, tmx, tmy, _]
|
38
|
+
TDElement -> NUMERIC NUMERIC 'TD'
|
39
|
+
[TD: tdleadx, tdleady,_]
|
40
|
+
TdElement -> NUMERIC NUMERIC 'Td'
|
41
|
+
[Td: xpos, ypos,_]
|
42
|
+
TcElement -> NUMERIC 'Tc'
|
43
|
+
[Tc: charspace, _]
|
44
|
+
TWElement -> NUMERIC 'Tw'
|
45
|
+
[TW: wordspace, _]
|
46
|
+
TfElement -> '/' ALPHANUMERIC NUMERIC 'Tf'
|
47
|
+
[Tf: _, fontname, fontsize, _]
|
48
|
+
TLElement -> NUMERIC 'TL'
|
49
|
+
[TL: lead, _]
|
50
|
+
TjElement -> TJSNIPPET 'Tj'
|
51
|
+
[Tj: snippet, _ ]
|
52
|
+
HexElement -> '<' HEXSNIPPET '>'
|
53
|
+
[^: _, hex, _ ]
|
54
|
+
| '<' '>'
|
55
|
+
[^: _, _ ]
|
56
|
+
TjHex -> HexElement 'Tj'
|
57
|
+
[Tjhex: hexsnippet, _ ]
|
58
|
+
TJArrayElements -> TJSingleElement+
|
59
|
+
[^: values ]
|
60
|
+
TJSingleElement -> TJSNIPPET
|
61
|
+
[: snippet ]
|
62
|
+
| NUMERIC
|
63
|
+
[: kerning ]
|
64
|
+
| HexElement
|
65
|
+
[: hexsnippet ]
|
66
|
+
| '())'
|
67
|
+
TSElement -> NUMERIC 'Ts'
|
68
|
+
[Ts: textrise]
|
69
|
+
TStarElement -> 'T*'
|
70
|
+
[TStar: linebreak ]
|
71
|
+
TZElement -> NUMERIC 'Tz'
|
72
|
+
[Tz: hscaling]
|
73
|
+
RgElement -> NUMERIC NUMERIC NUMERIC 'rg'
|
74
|
+
[Rg: _, _, _, _]
|
75
|
+
TrElement -> NUMERIC 'Tr'
|
76
|
+
[Tr: render]
|
77
|
+
Apostroph -> TJSNIPPET "'"
|
78
|
+
[Apo: aposnippet, _]
|
79
|
+
Quotes -> NUMERIC NUMERIC TJSNIPPET '"'
|
80
|
+
[Quo: wordspace, charspace, aposnippet, _]
|
81
|
+
LineWidth -> NUMERIC 'w'
|
82
|
+
[Width: width, _]
|
83
|
+
BTElement -> /^BT$/n
|
84
|
+
[BT: _]
|
85
|
+
ETElement -> /^ET$/n
|
86
|
+
[ET: _]
|
87
|
+
Hash -> '<<' (IDENTIFIER HashExpr)* '>>'
|
88
|
+
[: _, pairs, _]
|
89
|
+
HashExpr -> IDENTIFIER [^: val]
|
90
|
+
| NUMERIC [^: val]
|
91
|
+
| WORD [^: val]
|
92
|
+
UElement -> NUMERIC NUMERIC NUMERIC NUMERIC 'k'
|
93
|
+
| '/' ALPHANUMERIC /[gc]s/in
|
94
|
+
| '/' ALPHANUMERIC Hash /BDC|BMC/
|
95
|
+
| 'EMC'
|
96
|
+
| CNElements /scn?/in
|
97
|
+
| NUMERIC /[gijJMG]/n
|
98
|
+
| NUMERIC NUMERIC NUMERIC 'RG'
|
99
|
+
| '[]' NUMERIC 'd'
|
100
|
+
CNElements -> CNElement+
|
101
|
+
CNElement -> ALPHANUMERIC
|
102
|
+
| '/' ALPHANUMERIC ALPHANUMERIC?
|
data/lib/rpdf2txt/parser.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rpdf2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 55
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 8
|
9
|
-
- 3
|
10
|
-
version: 0.8.3
|
9
|
+
- 4
|
10
|
+
version: 0.8.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Masaomi Hatakeyama, Zeno R.R. Davatz
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-01-
|
18
|
+
date: 2012-01-06 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -50,8 +50,8 @@ extra_rdoc_files:
|
|
50
50
|
- usage-en.txt
|
51
51
|
- user-stories/UserStories_Rpdf2Txt.txt
|
52
52
|
files:
|
53
|
-
- LICENCE
|
54
53
|
- History.txt
|
54
|
+
- LICENCE
|
55
55
|
- Manifest.txt
|
56
56
|
- README.txt
|
57
57
|
- Rakefile
|
@@ -83,6 +83,10 @@ files:
|
|
83
83
|
- lib/rpdf2txt-rockit/version.rb
|
84
84
|
- lib/rpdf2txt/attributesparser.rb
|
85
85
|
- lib/rpdf2txt/cmapparser.rb
|
86
|
+
- lib/rpdf2txt/data/_cmap.grammar
|
87
|
+
- lib/rpdf2txt/data/_cmap_range.grammar
|
88
|
+
- lib/rpdf2txt/data/_pdfattributes.grammar
|
89
|
+
- lib/rpdf2txt/data/_pdftext.grammar
|
86
90
|
- lib/rpdf2txt/data/cmap.grammar
|
87
91
|
- lib/rpdf2txt/data/cmap.rb
|
88
92
|
- lib/rpdf2txt/data/cmap_range.grammar
|
@@ -163,6 +167,7 @@ files:
|
|
163
167
|
- test/data/working_obj2
|
164
168
|
- test/mock.rb
|
165
169
|
- test/suite.rb
|
170
|
+
- test/test_object.rb
|
166
171
|
- test/test_pdf_object.rb
|
167
172
|
- test/test_pdf_parser.rb
|
168
173
|
- test/test_pdf_text.rb
|
@@ -172,7 +177,6 @@ files:
|
|
172
177
|
- usage-en.txt
|
173
178
|
- user-stories/UserStories_Rpdf2Txt.txt
|
174
179
|
- user-stories/documents/swissmedicjournal/04_2004.pdf
|
175
|
-
- test/test_object.rb
|
176
180
|
- .gemtest
|
177
181
|
has_rdoc: true
|
178
182
|
homepage: http://scm.ywesee.com/?p=rpdf2txt/.git;a=summary
|