docsplit 0.3.4 → 0.4.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.3.4' # Keep version in sync with docsplit.rb
4
- s.date = '2010-8-20'
3
+ s.version = '0.4.0' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-8-23'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.3.4' # Keep in sync with gemspec.
4
+ VERSION = '0.4.0' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -54,7 +54,8 @@ module Docsplit
54
54
  def self.extract_pdf(docs, opts={})
55
55
  [docs].flatten.each do |doc|
56
56
  basename = File.basename(doc, File.extname(doc))
57
- run "-jar #{ROOT}/vendor/jodconverter/jodconverter-cli-2.2.2.jar #{doc} #{opts[:output] || '.'}/#{basename}.pdf", [], {}
57
+ options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
58
+ run "#{options} \"#{doc}\" \"#{opts[:output] || '.'}/#{basename}.pdf\"", [], {}
58
59
  end
59
60
  end
60
61
 
@@ -46,6 +46,7 @@ module Docsplit
46
46
  raise ExtractionFailed, result if $? != 0
47
47
  end
48
48
  end
49
+ ensure
49
50
  FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
50
51
  end
51
52
 
@@ -71,6 +71,7 @@ module Docsplit
71
71
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
72
72
  run "tesseract #{tiff} #{base_path} -l eng 2>&1"
73
73
  end
74
+ ensure
74
75
  FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
75
76
  end
76
77
 
@@ -0,0 +1,236 @@
1
+ //
2
+ // JODConverter Document Formats Configuration
3
+ //
4
+ [
5
+ {
6
+ "name": "Portable Document Format",
7
+ "extension": "pdf",
8
+ "mediaType": "application/pdf",
9
+ "storePropertiesByFamily": {
10
+ "DRAWING": {"FilterName": "draw_pdf_Export"},
11
+ "SPREADSHEET": {"FilterName": "calc_pdf_Export"},
12
+ "PRESENTATION": {"FilterName": "impress_pdf_Export"},
13
+ "TEXT": {"FilterName": "writer_pdf_Export"}
14
+ }
15
+ },
16
+ {
17
+ "name": "Macromedia Flash",
18
+ "extension": "swf",
19
+ "mediaType": "application/x-shockwave-flash",
20
+ "storePropertiesByFamily": {
21
+ "DRAWING": {"FilterName": "draw_flash_Export"},
22
+ "PRESENTATION": {"FilterName": "impress_flash_Export"}
23
+ }
24
+ },
25
+ {
26
+ "name": "HTML",
27
+ "extension": "html",
28
+ "mediaType": "text/html",
29
+ "inputFamily": "TEXT",
30
+ "storePropertiesByFamily": {
31
+ "SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
32
+ "PRESENTATION": {"FilterName": "impress_html_Export"},
33
+ "TEXT": {"FilterName": "HTML (StarWriter)"}
34
+ }
35
+ },
36
+ {
37
+ "name": "OpenDocument Text",
38
+ "extension": "odt",
39
+ "mediaType": "application/vnd.oasis.opendocument.text",
40
+ "inputFamily": "TEXT",
41
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
42
+ },
43
+ {
44
+ "name": "OpenOffice.org 1.0 Text Document",
45
+ "extension": "sxw",
46
+ "mediaType": "application/vnd.sun.xml.writer",
47
+ "inputFamily": "TEXT",
48
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
49
+ },
50
+ {
51
+ "name": "Microsoft Word",
52
+ "extension": "doc",
53
+ "mediaType": "application/msword",
54
+ "inputFamily": "TEXT",
55
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
56
+ },
57
+ {
58
+ "name": "Microsoft Word 2007 XML",
59
+ "extension": "docx",
60
+ "mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
61
+ "inputFamily": "TEXT"
62
+ },
63
+ {
64
+ "name": "Rich Text Format",
65
+ "extension": "rtf",
66
+ "mediaType": "text/rtf",
67
+ "inputFamily": "TEXT",
68
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
69
+ },
70
+ {
71
+ "name": "WordPerfect",
72
+ "extension": "wpd",
73
+ "mediaType": "application/wordperfect",
74
+ "inputFamily": "TEXT"
75
+ },
76
+ {
77
+ "name": "Plain Text",
78
+ "extension": "txt",
79
+ "mediaType": "text/plain",
80
+ "inputFamily": "TEXT",
81
+ "loadProperties": {
82
+ "FilterName": "Text (encoded)",
83
+ "FilterOptions": "utf8"
84
+ },
85
+ "storePropertiesByFamily": {"TEXT": {
86
+ "FilterName": "Text (encoded)",
87
+ "FilterOptions": "utf8"
88
+ }}
89
+ },
90
+ {
91
+ "name": "MediaWiki wikitext",
92
+ "extension": "wiki",
93
+ "mediaType": "text/x-wiki",
94
+ "storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
95
+ },
96
+ {
97
+ "name": "OpenDocument Spreadsheet",
98
+ "extension": "ods",
99
+ "mediaType": "application/vnd.oasis.opendocument.spreadsheet",
100
+ "inputFamily": "SPREADSHEET",
101
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
102
+ },
103
+ {
104
+ "name": "OpenOffice.org 1.0 Spreadsheet",
105
+ "extension": "sxc",
106
+ "mediaType": "application/vnd.sun.xml.calc",
107
+ "inputFamily": "SPREADSHEET",
108
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
109
+ },
110
+ {
111
+ "name": "Microsoft Excel",
112
+ "extension": "xls",
113
+ "mediaType": "application/vnd.ms-excel",
114
+ "inputFamily": "SPREADSHEET",
115
+ "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
116
+ },
117
+ {
118
+ "name": "Microsoft Excel 2007 XML",
119
+ "extension": "xlsx",
120
+ "mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
121
+ "inputFamily": "SPREADSHEET"
122
+ },
123
+ {
124
+ "name": "Comma Separated Values",
125
+ "extension": "csv",
126
+ "mediaType": "text/csv",
127
+ "inputFamily": "SPREADSHEET",
128
+ "loadProperties": {
129
+ "FilterName": "Text - txt - csv (StarCalc)",
130
+ "FilterOptions": "44,34,0"
131
+ },
132
+ "storePropertiesByFamily": {"SPREADSHEET": {
133
+ "FilterName": "Text - txt - csv (StarCalc)",
134
+ "FilterOptions": "44,34,0"
135
+ }}
136
+ },
137
+ {
138
+ "name": "Tab Separated Values",
139
+ "extension": "tsv",
140
+ "mediaType": "text/tab-separated-values",
141
+ "inputFamily": "SPREADSHEET",
142
+ "loadProperties": {
143
+ "FilterName": "Text - txt - csv (StarCalc)",
144
+ "FilterOptions": "9,34,0"
145
+ },
146
+ "storePropertiesByFamily": {"SPREADSHEET": {
147
+ "FilterName": "Text - txt - csv (StarCalc)",
148
+ "FilterOptions": "9,34,0"
149
+ }}
150
+ },
151
+ {
152
+ "name": "OpenDocument Presentation",
153
+ "extension": "odp",
154
+ "mediaType": "application/vnd.oasis.opendocument.presentation",
155
+ "inputFamily": "PRESENTATION",
156
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
157
+ },
158
+ {
159
+ "name": "OpenOffice.org 1.0 Presentation",
160
+ "extension": "sxi",
161
+ "mediaType": "application/vnd.sun.xml.impress",
162
+ "inputFamily": "PRESENTATION",
163
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
164
+ },
165
+ {
166
+ "name": "Microsoft PowerPoint",
167
+ "extension": "ppt",
168
+ "mediaType": "application/vnd.ms-powerpoint",
169
+ "inputFamily": "PRESENTATION",
170
+ "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
171
+ },
172
+ {
173
+ "name": "Microsoft PowerPoint 2007 XML",
174
+ "extension": "pptx",
175
+ "mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
176
+ "inputFamily": "PRESENTATION"
177
+ },
178
+ {
179
+ "name": "OpenDocument Drawing",
180
+ "extension": "odg",
181
+ "mediaType": "application/vnd.oasis.opendocument.graphics",
182
+ "inputFamily": "DRAWING",
183
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
184
+ },
185
+ {
186
+ "name": "Scalable Vector Graphics",
187
+ "extension": "svg",
188
+ "mediaType": "image/svg+xml",
189
+ "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
190
+ },
191
+ {
192
+ "name": "Portable Network Graphic",
193
+ "extension": "png",
194
+ "mediaType": "image/png",
195
+ "storePropertiesByFamily": {
196
+ "DRAWING": {"FilterName": "draw_png_Export"},
197
+ "PRESENTATION": {"FilterName": "impress_png_Export"}
198
+ }
199
+ },
200
+ {
201
+ "name": "Graphics Interchange Format",
202
+ "extension": "gif",
203
+ "mediaType": "image/gif",
204
+ "storePropertiesByFamily": {
205
+ "DRAWING": {"FilterName": "draw_gif_Export"},
206
+ "PRESENTATION": {"FilterName": "impress_gif_Export"}
207
+ }
208
+ },
209
+ {
210
+ "name": "Joint Photographic Experts Group",
211
+ "extension": "jpg",
212
+ "mediaType": "image/jpeg",
213
+ "storePropertiesByFamily": {
214
+ "DRAWING": {"FilterName": "draw_jpg_Export"},
215
+ "PRESENTATION": {"FilterName": "impress_jpg_Export"}
216
+ }
217
+ },
218
+ {
219
+ "name": "Windows Bitmap",
220
+ "extension": "bmp",
221
+ "mediaType": "image/bmp",
222
+ "storePropertiesByFamily": {
223
+ "DRAWING": {"FilterName": "draw_bmp_Export"},
224
+ "PRESENTATION": {"FilterName": "impress_bmp_Export"}
225
+ }
226
+ },
227
+ {
228
+ "name": "Tagged Image File Format",
229
+ "extension": "tif",
230
+ "mediaType": "image/tiff",
231
+ "storePropertiesByFamily": {
232
+ "DRAWING": {"FilterName": "draw_tif_Export"},
233
+ "PRESENTATION": {"FilterName": "impress_tif_Export"}
234
+ }
235
+ }
236
+ ]
File without changes
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 3
9
8
  - 4
10
- version: 0.3.4
9
+ - 0
10
+ version: 0.4.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-08-20 00:00:00 -04:00
19
+ date: 2010-08-23 00:00:00 -04:00
20
20
  default_executable:
21
21
  dependencies: []
22
22
 
@@ -37,16 +37,15 @@ files:
37
37
  - lib/docsplit/transparent_pdfs.rb
38
38
  - lib/docsplit.rb
39
39
  - bin/docsplit
40
- - vendor/jodconverter/commons-cli-1.2.jar
40
+ - vendor/conf/document-formats.js
41
+ - vendor/jodconverter/commons-cli-1.1.jar
41
42
  - vendor/jodconverter/commons-io-1.4.jar
42
- - vendor/jodconverter/jodconverter-2.2.2.jar
43
- - vendor/jodconverter/jodconverter-cli-2.2.2.jar
44
- - vendor/jodconverter/juh-3.0.1.jar
45
- - vendor/jodconverter/jurt-3.0.1.jar
46
- - vendor/jodconverter/ridl-3.0.1.jar
47
- - vendor/jodconverter/slf4j-api-1.5.6.jar
48
- - vendor/jodconverter/slf4j-jdk14-1.5.6.jar
49
- - vendor/jodconverter/unoil-3.0.1.jar
43
+ - vendor/jodconverter/jodconverter-core-3.0-beta-3.jar
44
+ - vendor/jodconverter/json-20080701.jar
45
+ - vendor/jodconverter/juh-3.1.0.jar
46
+ - vendor/jodconverter/jurt-3.1.0.jar
47
+ - vendor/jodconverter/ridl-3.1.0.jar
48
+ - vendor/jodconverter/unoil-3.1.0.jar
50
49
  - vendor/logging.properties
51
50
  - docsplit.gemspec
52
51
  - LICENSE