docsplit 0.3.4 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +3 -2
- data/lib/docsplit/image_extractor.rb +1 -0
- data/lib/docsplit/text_extractor.rb +1 -0
- data/vendor/conf/document-formats.js +236 -0
- data/vendor/jodconverter/commons-cli-1.1.jar +0 -0
- data/vendor/jodconverter/commons-io-1.4.jar +0 -0
- data/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar +0 -0
- data/vendor/jodconverter/json-20080701.jar +0 -0
- data/vendor/jodconverter/{juh-3.0.1.jar → juh-3.1.0.jar} +0 -0
- data/vendor/jodconverter/{jurt-3.0.1.jar → jurt-3.1.0.jar} +0 -0
- data/vendor/jodconverter/ridl-3.1.0.jar +0 -0
- data/vendor/jodconverter/unoil-3.1.0.jar +0 -0
- metadata +12 -13
- data/vendor/jodconverter/commons-cli-1.2.jar +0 -0
- data/vendor/jodconverter/jodconverter-2.2.2.jar +0 -0
- data/vendor/jodconverter/jodconverter-cli-2.2.2.jar +0 -0
- data/vendor/jodconverter/ridl-3.0.1.jar +0 -0
- data/vendor/jodconverter/slf4j-api-1.5.6.jar +0 -0
- data/vendor/jodconverter/slf4j-jdk14-1.5.6.jar +0 -0
- data/vendor/jodconverter/unoil-3.0.1.jar +0 -0
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.3.4'
|
4
|
-
s.date = '2010-8-
|
3
|
+
s.version = '0.4.0' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-8-23'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# The Docsplit module delegates to the Java PDF extractors.
|
2
2
|
module Docsplit
|
3
3
|
|
4
|
-
VERSION = '0.3.4'
|
4
|
+
VERSION = '0.4.0' # Keep in sync with gemspec.
|
5
5
|
|
6
6
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
7
7
|
|
@@ -54,7 +54,8 @@ module Docsplit
|
|
54
54
|
def self.extract_pdf(docs, opts={})
|
55
55
|
[docs].flatten.each do |doc|
|
56
56
|
basename = File.basename(doc, File.extname(doc))
|
57
|
-
|
57
|
+
options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
|
58
|
+
run "#{options} \"#{doc}\" \"#{opts[:output] || '.'}/#{basename}.pdf\"", [], {}
|
58
59
|
end
|
59
60
|
end
|
60
61
|
|
@@ -71,6 +71,7 @@ module Docsplit
|
|
71
71
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
72
72
|
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
|
73
73
|
end
|
74
|
+
ensure
|
74
75
|
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
|
75
76
|
end
|
76
77
|
|
@@ -0,0 +1,236 @@
|
|
1
|
+
//
|
2
|
+
// JODConverter Document Formats Configuration
|
3
|
+
//
|
4
|
+
[
|
5
|
+
{
|
6
|
+
"name": "Portable Document Format",
|
7
|
+
"extension": "pdf",
|
8
|
+
"mediaType": "application/pdf",
|
9
|
+
"storePropertiesByFamily": {
|
10
|
+
"DRAWING": {"FilterName": "draw_pdf_Export"},
|
11
|
+
"SPREADSHEET": {"FilterName": "calc_pdf_Export"},
|
12
|
+
"PRESENTATION": {"FilterName": "impress_pdf_Export"},
|
13
|
+
"TEXT": {"FilterName": "writer_pdf_Export"}
|
14
|
+
}
|
15
|
+
},
|
16
|
+
{
|
17
|
+
"name": "Macromedia Flash",
|
18
|
+
"extension": "swf",
|
19
|
+
"mediaType": "application/x-shockwave-flash",
|
20
|
+
"storePropertiesByFamily": {
|
21
|
+
"DRAWING": {"FilterName": "draw_flash_Export"},
|
22
|
+
"PRESENTATION": {"FilterName": "impress_flash_Export"}
|
23
|
+
}
|
24
|
+
},
|
25
|
+
{
|
26
|
+
"name": "HTML",
|
27
|
+
"extension": "html",
|
28
|
+
"mediaType": "text/html",
|
29
|
+
"inputFamily": "TEXT",
|
30
|
+
"storePropertiesByFamily": {
|
31
|
+
"SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
|
32
|
+
"PRESENTATION": {"FilterName": "impress_html_Export"},
|
33
|
+
"TEXT": {"FilterName": "HTML (StarWriter)"}
|
34
|
+
}
|
35
|
+
},
|
36
|
+
{
|
37
|
+
"name": "OpenDocument Text",
|
38
|
+
"extension": "odt",
|
39
|
+
"mediaType": "application/vnd.oasis.opendocument.text",
|
40
|
+
"inputFamily": "TEXT",
|
41
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
|
42
|
+
},
|
43
|
+
{
|
44
|
+
"name": "OpenOffice.org 1.0 Text Document",
|
45
|
+
"extension": "sxw",
|
46
|
+
"mediaType": "application/vnd.sun.xml.writer",
|
47
|
+
"inputFamily": "TEXT",
|
48
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
|
49
|
+
},
|
50
|
+
{
|
51
|
+
"name": "Microsoft Word",
|
52
|
+
"extension": "doc",
|
53
|
+
"mediaType": "application/msword",
|
54
|
+
"inputFamily": "TEXT",
|
55
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
|
56
|
+
},
|
57
|
+
{
|
58
|
+
"name": "Microsoft Word 2007 XML",
|
59
|
+
"extension": "docx",
|
60
|
+
"mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
61
|
+
"inputFamily": "TEXT"
|
62
|
+
},
|
63
|
+
{
|
64
|
+
"name": "Rich Text Format",
|
65
|
+
"extension": "rtf",
|
66
|
+
"mediaType": "text/rtf",
|
67
|
+
"inputFamily": "TEXT",
|
68
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
|
69
|
+
},
|
70
|
+
{
|
71
|
+
"name": "WordPerfect",
|
72
|
+
"extension": "wpd",
|
73
|
+
"mediaType": "application/wordperfect",
|
74
|
+
"inputFamily": "TEXT"
|
75
|
+
},
|
76
|
+
{
|
77
|
+
"name": "Plain Text",
|
78
|
+
"extension": "txt",
|
79
|
+
"mediaType": "text/plain",
|
80
|
+
"inputFamily": "TEXT",
|
81
|
+
"loadProperties": {
|
82
|
+
"FilterName": "Text (encoded)",
|
83
|
+
"FilterOptions": "utf8"
|
84
|
+
},
|
85
|
+
"storePropertiesByFamily": {"TEXT": {
|
86
|
+
"FilterName": "Text (encoded)",
|
87
|
+
"FilterOptions": "utf8"
|
88
|
+
}}
|
89
|
+
},
|
90
|
+
{
|
91
|
+
"name": "MediaWiki wikitext",
|
92
|
+
"extension": "wiki",
|
93
|
+
"mediaType": "text/x-wiki",
|
94
|
+
"storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
|
95
|
+
},
|
96
|
+
{
|
97
|
+
"name": "OpenDocument Spreadsheet",
|
98
|
+
"extension": "ods",
|
99
|
+
"mediaType": "application/vnd.oasis.opendocument.spreadsheet",
|
100
|
+
"inputFamily": "SPREADSHEET",
|
101
|
+
"storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
|
102
|
+
},
|
103
|
+
{
|
104
|
+
"name": "OpenOffice.org 1.0 Spreadsheet",
|
105
|
+
"extension": "sxc",
|
106
|
+
"mediaType": "application/vnd.sun.xml.calc",
|
107
|
+
"inputFamily": "SPREADSHEET",
|
108
|
+
"storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
|
109
|
+
},
|
110
|
+
{
|
111
|
+
"name": "Microsoft Excel",
|
112
|
+
"extension": "xls",
|
113
|
+
"mediaType": "application/vnd.ms-excel",
|
114
|
+
"inputFamily": "SPREADSHEET",
|
115
|
+
"storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
|
116
|
+
},
|
117
|
+
{
|
118
|
+
"name": "Microsoft Excel 2007 XML",
|
119
|
+
"extension": "xlsx",
|
120
|
+
"mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
121
|
+
"inputFamily": "SPREADSHEET"
|
122
|
+
},
|
123
|
+
{
|
124
|
+
"name": "Comma Separated Values",
|
125
|
+
"extension": "csv",
|
126
|
+
"mediaType": "text/csv",
|
127
|
+
"inputFamily": "SPREADSHEET",
|
128
|
+
"loadProperties": {
|
129
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
130
|
+
"FilterOptions": "44,34,0"
|
131
|
+
},
|
132
|
+
"storePropertiesByFamily": {"SPREADSHEET": {
|
133
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
134
|
+
"FilterOptions": "44,34,0"
|
135
|
+
}}
|
136
|
+
},
|
137
|
+
{
|
138
|
+
"name": "Tab Separated Values",
|
139
|
+
"extension": "tsv",
|
140
|
+
"mediaType": "text/tab-separated-values",
|
141
|
+
"inputFamily": "SPREADSHEET",
|
142
|
+
"loadProperties": {
|
143
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
144
|
+
"FilterOptions": "9,34,0"
|
145
|
+
},
|
146
|
+
"storePropertiesByFamily": {"SPREADSHEET": {
|
147
|
+
"FilterName": "Text - txt - csv (StarCalc)",
|
148
|
+
"FilterOptions": "9,34,0"
|
149
|
+
}}
|
150
|
+
},
|
151
|
+
{
|
152
|
+
"name": "OpenDocument Presentation",
|
153
|
+
"extension": "odp",
|
154
|
+
"mediaType": "application/vnd.oasis.opendocument.presentation",
|
155
|
+
"inputFamily": "PRESENTATION",
|
156
|
+
"storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
|
157
|
+
},
|
158
|
+
{
|
159
|
+
"name": "OpenOffice.org 1.0 Presentation",
|
160
|
+
"extension": "sxi",
|
161
|
+
"mediaType": "application/vnd.sun.xml.impress",
|
162
|
+
"inputFamily": "PRESENTATION",
|
163
|
+
"storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
|
164
|
+
},
|
165
|
+
{
|
166
|
+
"name": "Microsoft PowerPoint",
|
167
|
+
"extension": "ppt",
|
168
|
+
"mediaType": "application/vnd.ms-powerpoint",
|
169
|
+
"inputFamily": "PRESENTATION",
|
170
|
+
"storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
|
171
|
+
},
|
172
|
+
{
|
173
|
+
"name": "Microsoft PowerPoint 2007 XML",
|
174
|
+
"extension": "pptx",
|
175
|
+
"mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
176
|
+
"inputFamily": "PRESENTATION"
|
177
|
+
},
|
178
|
+
{
|
179
|
+
"name": "OpenDocument Drawing",
|
180
|
+
"extension": "odg",
|
181
|
+
"mediaType": "application/vnd.oasis.opendocument.graphics",
|
182
|
+
"inputFamily": "DRAWING",
|
183
|
+
"storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
|
184
|
+
},
|
185
|
+
{
|
186
|
+
"name": "Scalable Vector Graphics",
|
187
|
+
"extension": "svg",
|
188
|
+
"mediaType": "image/svg+xml",
|
189
|
+
"storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
|
190
|
+
},
|
191
|
+
{
|
192
|
+
"name": "Portable Network Graphic",
|
193
|
+
"extension": "png",
|
194
|
+
"mediaType": "image/png",
|
195
|
+
"storePropertiesByFamily": {
|
196
|
+
"DRAWING": {"FilterName": "draw_png_Export"},
|
197
|
+
"PRESENTATION": {"FilterName": "impress_png_Export"}
|
198
|
+
}
|
199
|
+
},
|
200
|
+
{
|
201
|
+
"name": "Graphics Interchange Format",
|
202
|
+
"extension": "gif",
|
203
|
+
"mediaType": "image/gif",
|
204
|
+
"storePropertiesByFamily": {
|
205
|
+
"DRAWING": {"FilterName": "draw_gif_Export"},
|
206
|
+
"PRESENTATION": {"FilterName": "impress_gif_Export"}
|
207
|
+
}
|
208
|
+
},
|
209
|
+
{
|
210
|
+
"name": "Joint Photographic Experts Group",
|
211
|
+
"extension": "jpg",
|
212
|
+
"mediaType": "image/jpeg",
|
213
|
+
"storePropertiesByFamily": {
|
214
|
+
"DRAWING": {"FilterName": "draw_jpg_Export"},
|
215
|
+
"PRESENTATION": {"FilterName": "impress_jpg_Export"}
|
216
|
+
}
|
217
|
+
},
|
218
|
+
{
|
219
|
+
"name": "Windows Bitmap",
|
220
|
+
"extension": "bmp",
|
221
|
+
"mediaType": "image/bmp",
|
222
|
+
"storePropertiesByFamily": {
|
223
|
+
"DRAWING": {"FilterName": "draw_bmp_Export"},
|
224
|
+
"PRESENTATION": {"FilterName": "impress_bmp_Export"}
|
225
|
+
}
|
226
|
+
},
|
227
|
+
{
|
228
|
+
"name": "Tagged Image File Format",
|
229
|
+
"extension": "tif",
|
230
|
+
"mediaType": "image/tiff",
|
231
|
+
"storePropertiesByFamily": {
|
232
|
+
"DRAWING": {"FilterName": "draw_tif_Export"},
|
233
|
+
"PRESENTATION": {"FilterName": "impress_tif_Export"}
|
234
|
+
}
|
235
|
+
}
|
236
|
+
]
|
Binary file
|
File without changes
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 3
|
9
8
|
- 4
|
10
|
-
|
9
|
+
- 0
|
10
|
+
version: 0.4.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jeremy Ashkenas
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2010-08-
|
19
|
+
date: 2010-08-23 00:00:00 -04:00
|
20
20
|
default_executable:
|
21
21
|
dependencies: []
|
22
22
|
|
@@ -37,16 +37,15 @@ files:
|
|
37
37
|
- lib/docsplit/transparent_pdfs.rb
|
38
38
|
- lib/docsplit.rb
|
39
39
|
- bin/docsplit
|
40
|
-
- vendor/jodconverter/commons-cli-1.2.jar
|
40
|
+
- vendor/conf/document-formats.js
|
41
|
+
- vendor/jodconverter/commons-cli-1.1.jar
|
41
42
|
- vendor/jodconverter/commons-io-1.4.jar
|
42
|
-
- vendor/jodconverter/jodconverter-2.2.2.jar
|
43
|
-
- vendor/jodconverter/jodconverter-cli-2.2.2.jar
|
44
|
-
- vendor/jodconverter/juh-3.0.1.jar
|
45
|
-
- vendor/jodconverter/jurt-3.0.1.jar
|
46
|
-
- vendor/jodconverter/ridl-3.0.1.jar
|
47
|
-
- vendor/jodconverter/slf4j-api-1.5.6.jar
|
48
|
-
- vendor/jodconverter/slf4j-jdk14-1.5.6.jar
|
49
|
-
- vendor/jodconverter/unoil-3.0.1.jar
|
43
|
+
- vendor/jodconverter/jodconverter-core-3.0-beta-3.jar
|
44
|
+
- vendor/jodconverter/json-20080701.jar
|
45
|
+
- vendor/jodconverter/juh-3.1.0.jar
|
46
|
+
- vendor/jodconverter/jurt-3.1.0.jar
|
47
|
+
- vendor/jodconverter/ridl-3.1.0.jar
|
48
|
+
- vendor/jodconverter/unoil-3.1.0.jar
|
50
49
|
- vendor/logging.properties
|
51
50
|
- docsplit.gemspec
|
52
51
|
- LICENSE
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|