plaintext 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE +339 -0
  7. data/README.md +127 -0
  8. data/Rakefile +6 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +7 -0
  11. data/lib/plaintext/codeset_util.rb +27 -0
  12. data/lib/plaintext/configuration.rb +27 -0
  13. data/lib/plaintext/file_handler/external_command_handler/doc_handler.rb +17 -0
  14. data/lib/plaintext/file_handler/external_command_handler/image_handler.rb +18 -0
  15. data/lib/plaintext/file_handler/external_command_handler/pdf_handler.rb +13 -0
  16. data/lib/plaintext/file_handler/external_command_handler/ppt_handler.rb +17 -0
  17. data/lib/plaintext/file_handler/external_command_handler/rtf_handler.rb +13 -0
  18. data/lib/plaintext/file_handler/external_command_handler/xls_handler.rb +22 -0
  19. data/lib/plaintext/file_handler/external_command_handler.rb +43 -0
  20. data/lib/plaintext/file_handler/plaintext_handler.rb +14 -0
  21. data/lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/docx_handler.rb +12 -0
  22. data/lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/pptx_handler.rb +30 -0
  23. data/lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/xlsx_handler.rb +12 -0
  24. data/lib/plaintext/file_handler/zipped_xml_handler/office_document_handler.rb +11 -0
  25. data/lib/plaintext/file_handler/zipped_xml_handler/opendocument_handler.rb +22 -0
  26. data/lib/plaintext/file_handler/zipped_xml_handler.rb +58 -0
  27. data/lib/plaintext/file_handler.rb +15 -0
  28. data/lib/plaintext/resolver.rb +48 -0
  29. data/lib/plaintext/version.rb +5 -0
  30. data/lib/plaintext.rb +28 -0
  31. data/plaintext.gemspec +29 -0
  32. data/plaintext.yml.example +41 -0
  33. metadata +162 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a474b4e8e63d06f2d83a97a7ff09c3b3375b27ed
4
+ data.tar.gz: 2cc7395ee7ab12c5588b9c187823ba7e126d91bd
5
+ SHA512:
6
+ metadata.gz: 7afd62607f1f46c95278952591f6bda1325ea153b1006dc0092cd0128b615ffa4a11668315084c327541fca5591e5195d30c2635e51ca7a4391c67bd89df8b8d
7
+ data.tar.gz: a64aa765178280422de0a05d201ee11b2779649184ff2d72572e223c2e1bf160ce1c0f12b256e59c8ae2b102bcba699e7fd051205326c80d8dfd51b6690aae65
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ .ruby-version
11
+ .idea
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.3
4
+ before_install:
5
+ - sudo apt-get -qq update
6
+ - sudo apt-get install -y catdoc unrtf poppler-utils tesseract-ocr
7
+ - gem install bundler -v 1.10.6
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in plaintext.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,339 @@
1
+ GNU GENERAL PUBLIC LICENSE
2
+ Version 2, June 1991
3
+
4
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6
+ Everyone is permitted to copy and distribute verbatim copies
7
+ of this license document, but changing it is not allowed.
8
+
9
+ Preamble
10
+
11
+ The licenses for most software are designed to take away your
12
+ freedom to share and change it. By contrast, the GNU General Public
13
+ License is intended to guarantee your freedom to share and change free
14
+ software--to make sure the software is free for all its users. This
15
+ General Public License applies to most of the Free Software
16
+ Foundation's software and to any other program whose authors commit to
17
+ using it. (Some other Free Software Foundation software is covered by
18
+ the GNU Lesser General Public License instead.) You can apply it to
19
+ your programs, too.
20
+
21
+ When we speak of free software, we are referring to freedom, not
22
+ price. Our General Public Licenses are designed to make sure that you
23
+ have the freedom to distribute copies of free software (and charge for
24
+ this service if you wish), that you receive source code or can get it
25
+ if you want it, that you can change the software or use pieces of it
26
+ in new free programs; and that you know you can do these things.
27
+
28
+ To protect your rights, we need to make restrictions that forbid
29
+ anyone to deny you these rights or to ask you to surrender the rights.
30
+ These restrictions translate to certain responsibilities for you if you
31
+ distribute copies of the software, or if you modify it.
32
+
33
+ For example, if you distribute copies of such a program, whether
34
+ gratis or for a fee, you must give the recipients all the rights that
35
+ you have. You must make sure that they, too, receive or can get the
36
+ source code. And you must show them these terms so they know their
37
+ rights.
38
+
39
+ We protect your rights with two steps: (1) copyright the software, and
40
+ (2) offer you this license which gives you legal permission to copy,
41
+ distribute and/or modify the software.
42
+
43
+ Also, for each author's protection and ours, we want to make certain
44
+ that everyone understands that there is no warranty for this free
45
+ software. If the software is modified by someone else and passed on, we
46
+ want its recipients to know that what they have is not the original, so
47
+ that any problems introduced by others will not reflect on the original
48
+ authors' reputations.
49
+
50
+ Finally, any free program is threatened constantly by software
51
+ patents. We wish to avoid the danger that redistributors of a free
52
+ program will individually obtain patent licenses, in effect making the
53
+ program proprietary. To prevent this, we have made it clear that any
54
+ patent must be licensed for everyone's free use or not licensed at all.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ GNU GENERAL PUBLIC LICENSE
60
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61
+
62
+ 0. This License applies to any program or other work which contains
63
+ a notice placed by the copyright holder saying it may be distributed
64
+ under the terms of this General Public License. The "Program", below,
65
+ refers to any such program or work, and a "work based on the Program"
66
+ means either the Program or any derivative work under copyright law:
67
+ that is to say, a work containing the Program or a portion of it,
68
+ either verbatim or with modifications and/or translated into another
69
+ language. (Hereinafter, translation is included without limitation in
70
+ the term "modification".) Each licensee is addressed as "you".
71
+
72
+ Activities other than copying, distribution and modification are not
73
+ covered by this License; they are outside its scope. The act of
74
+ running the Program is not restricted, and the output from the Program
75
+ is covered only if its contents constitute a work based on the
76
+ Program (independent of having been made by running the Program).
77
+ Whether that is true depends on what the Program does.
78
+
79
+ 1. You may copy and distribute verbatim copies of the Program's
80
+ source code as you receive it, in any medium, provided that you
81
+ conspicuously and appropriately publish on each copy an appropriate
82
+ copyright notice and disclaimer of warranty; keep intact all the
83
+ notices that refer to this License and to the absence of any warranty;
84
+ and give any other recipients of the Program a copy of this License
85
+ along with the Program.
86
+
87
+ You may charge a fee for the physical act of transferring a copy, and
88
+ you may at your option offer warranty protection in exchange for a fee.
89
+
90
+ 2. You may modify your copy or copies of the Program or any portion
91
+ of it, thus forming a work based on the Program, and copy and
92
+ distribute such modifications or work under the terms of Section 1
93
+ above, provided that you also meet all of these conditions:
94
+
95
+ a) You must cause the modified files to carry prominent notices
96
+ stating that you changed the files and the date of any change.
97
+
98
+ b) You must cause any work that you distribute or publish, that in
99
+ whole or in part contains or is derived from the Program or any
100
+ part thereof, to be licensed as a whole at no charge to all third
101
+ parties under the terms of this License.
102
+
103
+ c) If the modified program normally reads commands interactively
104
+ when run, you must cause it, when started running for such
105
+ interactive use in the most ordinary way, to print or display an
106
+ announcement including an appropriate copyright notice and a
107
+ notice that there is no warranty (or else, saying that you provide
108
+ a warranty) and that users may redistribute the program under
109
+ these conditions, and telling the user how to view a copy of this
110
+ License. (Exception: if the Program itself is interactive but
111
+ does not normally print such an announcement, your work based on
112
+ the Program is not required to print an announcement.)
113
+
114
+ These requirements apply to the modified work as a whole. If
115
+ identifiable sections of that work are not derived from the Program,
116
+ and can be reasonably considered independent and separate works in
117
+ themselves, then this License, and its terms, do not apply to those
118
+ sections when you distribute them as separate works. But when you
119
+ distribute the same sections as part of a whole which is a work based
120
+ on the Program, the distribution of the whole must be on the terms of
121
+ this License, whose permissions for other licensees extend to the
122
+ entire whole, and thus to each and every part regardless of who wrote it.
123
+
124
+ Thus, it is not the intent of this section to claim rights or contest
125
+ your rights to work written entirely by you; rather, the intent is to
126
+ exercise the right to control the distribution of derivative or
127
+ collective works based on the Program.
128
+
129
+ In addition, mere aggregation of another work not based on the Program
130
+ with the Program (or with a work based on the Program) on a volume of
131
+ a storage or distribution medium does not bring the other work under
132
+ the scope of this License.
133
+
134
+ 3. You may copy and distribute the Program (or a work based on it,
135
+ under Section 2) in object code or executable form under the terms of
136
+ Sections 1 and 2 above provided that you also do one of the following:
137
+
138
+ a) Accompany it with the complete corresponding machine-readable
139
+ source code, which must be distributed under the terms of Sections
140
+ 1 and 2 above on a medium customarily used for software interchange; or,
141
+
142
+ b) Accompany it with a written offer, valid for at least three
143
+ years, to give any third party, for a charge no more than your
144
+ cost of physically performing source distribution, a complete
145
+ machine-readable copy of the corresponding source code, to be
146
+ distributed under the terms of Sections 1 and 2 above on a medium
147
+ customarily used for software interchange; or,
148
+
149
+ c) Accompany it with the information you received as to the offer
150
+ to distribute corresponding source code. (This alternative is
151
+ allowed only for noncommercial distribution and only if you
152
+ received the program in object code or executable form with such
153
+ an offer, in accord with Subsection b above.)
154
+
155
+ The source code for a work means the preferred form of the work for
156
+ making modifications to it. For an executable work, complete source
157
+ code means all the source code for all modules it contains, plus any
158
+ associated interface definition files, plus the scripts used to
159
+ control compilation and installation of the executable. However, as a
160
+ special exception, the source code distributed need not include
161
+ anything that is normally distributed (in either source or binary
162
+ form) with the major components (compiler, kernel, and so on) of the
163
+ operating system on which the executable runs, unless that component
164
+ itself accompanies the executable.
165
+
166
+ If distribution of executable or object code is made by offering
167
+ access to copy from a designated place, then offering equivalent
168
+ access to copy the source code from the same place counts as
169
+ distribution of the source code, even though third parties are not
170
+ compelled to copy the source along with the object code.
171
+
172
+ 4. You may not copy, modify, sublicense, or distribute the Program
173
+ except as expressly provided under this License. Any attempt
174
+ otherwise to copy, modify, sublicense or distribute the Program is
175
+ void, and will automatically terminate your rights under this License.
176
+ However, parties who have received copies, or rights, from you under
177
+ this License will not have their licenses terminated so long as such
178
+ parties remain in full compliance.
179
+
180
+ 5. You are not required to accept this License, since you have not
181
+ signed it. However, nothing else grants you permission to modify or
182
+ distribute the Program or its derivative works. These actions are
183
+ prohibited by law if you do not accept this License. Therefore, by
184
+ modifying or distributing the Program (or any work based on the
185
+ Program), you indicate your acceptance of this License to do so, and
186
+ all its terms and conditions for copying, distributing or modifying
187
+ the Program or works based on it.
188
+
189
+ 6. Each time you redistribute the Program (or any work based on the
190
+ Program), the recipient automatically receives a license from the
191
+ original licensor to copy, distribute or modify the Program subject to
192
+ these terms and conditions. You may not impose any further
193
+ restrictions on the recipients' exercise of the rights granted herein.
194
+ You are not responsible for enforcing compliance by third parties to
195
+ this License.
196
+
197
+ 7. If, as a consequence of a court judgment or allegation of patent
198
+ infringement or for any other reason (not limited to patent issues),
199
+ conditions are imposed on you (whether by court order, agreement or
200
+ otherwise) that contradict the conditions of this License, they do not
201
+ excuse you from the conditions of this License. If you cannot
202
+ distribute so as to satisfy simultaneously your obligations under this
203
+ License and any other pertinent obligations, then as a consequence you
204
+ may not distribute the Program at all. For example, if a patent
205
+ license would not permit royalty-free redistribution of the Program by
206
+ all those who receive copies directly or indirectly through you, then
207
+ the only way you could satisfy both it and this License would be to
208
+ refrain entirely from distribution of the Program.
209
+
210
+ If any portion of this section is held invalid or unenforceable under
211
+ any particular circumstance, the balance of the section is intended to
212
+ apply and the section as a whole is intended to apply in other
213
+ circumstances.
214
+
215
+ It is not the purpose of this section to induce you to infringe any
216
+ patents or other property right claims or to contest validity of any
217
+ such claims; this section has the sole purpose of protecting the
218
+ integrity of the free software distribution system, which is
219
+ implemented by public license practices. Many people have made
220
+ generous contributions to the wide range of software distributed
221
+ through that system in reliance on consistent application of that
222
+ system; it is up to the author/donor to decide if he or she is willing
223
+ to distribute software through any other system and a licensee cannot
224
+ impose that choice.
225
+
226
+ This section is intended to make thoroughly clear what is believed to
227
+ be a consequence of the rest of this License.
228
+
229
+ 8. If the distribution and/or use of the Program is restricted in
230
+ certain countries either by patents or by copyrighted interfaces, the
231
+ original copyright holder who places the Program under this License
232
+ may add an explicit geographical distribution limitation excluding
233
+ those countries, so that distribution is permitted only in or among
234
+ countries not thus excluded. In such case, this License incorporates
235
+ the limitation as if written in the body of this License.
236
+
237
+ 9. The Free Software Foundation may publish revised and/or new versions
238
+ of the General Public License from time to time. Such new versions will
239
+ be similar in spirit to the present version, but may differ in detail to
240
+ address new problems or concerns.
241
+
242
+ Each version is given a distinguishing version number. If the Program
243
+ specifies a version number of this License which applies to it and "any
244
+ later version", you have the option of following the terms and conditions
245
+ either of that version or of any later version published by the Free
246
+ Software Foundation. If the Program does not specify a version number of
247
+ this License, you may choose any version ever published by the Free Software
248
+ Foundation.
249
+
250
+ 10. If you wish to incorporate parts of the Program into other free
251
+ programs whose distribution conditions are different, write to the author
252
+ to ask for permission. For software which is copyrighted by the Free
253
+ Software Foundation, write to the Free Software Foundation; we sometimes
254
+ make exceptions for this. Our decision will be guided by the two goals
255
+ of preserving the free status of all derivatives of our free software and
256
+ of promoting the sharing and reuse of software generally.
257
+
258
+ NO WARRANTY
259
+
260
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261
+ FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262
+ OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263
+ PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264
+ OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266
+ TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267
+ PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268
+ REPAIR OR CORRECTION.
269
+
270
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272
+ REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273
+ INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274
+ OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275
+ TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276
+ YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277
+ PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278
+ POSSIBILITY OF SUCH DAMAGES.
279
+
280
+ END OF TERMS AND CONDITIONS
281
+
282
+ How to Apply These Terms to Your New Programs
283
+
284
+ If you develop a new program, and you want it to be of the greatest
285
+ possible use to the public, the best way to achieve this is to make it
286
+ free software which everyone can redistribute and change under these terms.
287
+
288
+ To do so, attach the following notices to the program. It is safest
289
+ to attach them to the start of each source file to most effectively
290
+ convey the exclusion of warranty; and each file should have at least
291
+ the "copyright" line and a pointer to where the full notice is found.
292
+
293
+ <one line to give the program's name and a brief idea of what it does.>
294
+ Copyright (C) <year> <name of author>
295
+
296
+ This program is free software; you can redistribute it and/or modify
297
+ it under the terms of the GNU General Public License as published by
298
+ the Free Software Foundation; either version 2 of the License, or
299
+ (at your option) any later version.
300
+
301
+ This program is distributed in the hope that it will be useful,
302
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
303
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304
+ GNU General Public License for more details.
305
+
306
+ You should have received a copy of the GNU General Public License along
307
+ with this program; if not, write to the Free Software Foundation, Inc.,
308
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309
+
310
+ Also add information on how to contact you by electronic and paper mail.
311
+
312
+ If the program is interactive, make it output a short notice like this
313
+ when it starts in an interactive mode:
314
+
315
+ Gnomovision version 69, Copyright (C) year name of author
316
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317
+ This is free software, and you are welcome to redistribute it
318
+ under certain conditions; type `show c' for details.
319
+
320
+ The hypothetical commands `show w' and `show c' should show the appropriate
321
+ parts of the General Public License. Of course, the commands you use may
322
+ be called something other than `show w' and `show c'; they could even be
323
+ mouse-clicks or menu items--whatever suits your program.
324
+
325
+ You should also get your employer (if you work as a programmer) or your
326
+ school, if any, to sign a "copyright disclaimer" for the program, if
327
+ necessary. Here is a sample; alter the names:
328
+
329
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
331
+
332
+ <signature of Ty Coon>, 1 April 1989
333
+ Ty Coon, President of Vice
334
+
335
+ This General Public License does not permit incorporating your program into
336
+ proprietary programs. If your program is a subroutine library, you may
337
+ consider it more useful to permit linking proprietary applications with the
338
+ library. If this is what you want to do, use the GNU Lesser General
339
+ Public License instead of this License.
data/README.md ADDED
@@ -0,0 +1,127 @@
1
+ # plaintext
2
+
3
+ This gem wraps command line tools to extract plain text from typical files such as
4
+
5
+ - PDF
6
+ - RTF
7
+ - MS Office
8
+ - Word (doc, docx)
9
+ - Excel (xls, xlsx)
10
+ - PowerPoint (ppt, pptx)
11
+ - OpenOffice + Libre
12
+ - Presentation
13
+ - Text
14
+ - Spreadsheet
15
+ - Image files (png, jpeg, tiff), such as screenshots and scanned documents, through character recognition (OCR)
16
+ - Plaintext (txt)
17
+ - Comma-separated values (csv)
18
+
19
+ ## Acknowledgements
20
+
21
+ This gem is based on work by Jens Krämer / Planio, who originally provided it as a
22
+ [patch for Redmine](https://www.redmine.org/issues/306). Now, it is a collaborative effort of
23
+ both project management software providers [Planio](https://plan.io) and [OpenProject](https://openproject.org)
24
+ as both systems tackle the identical challenge to extract plain text from attachment files.
25
+
26
+ ## Installation
27
+
28
+ Add this line to your application's Gemfile:
29
+
30
+ ```ruby
31
+ gem 'plaintext'
32
+ ```
33
+
34
+ And then execute:
35
+
36
+ $ bundle
37
+
38
+ Or install it yourself as:
39
+
40
+ $ gem install plaintext
41
+
42
+ #### Rails
43
+
44
+ In a Rails application save `plaintext.yml.example` in `config/plaintext.yml` and overwrite the settings to
45
+ your needs.
46
+
47
+ Then load that configuration file in an initializer. Add the following lines to `config/initializers/plaintext.rb`:
48
+
49
+ ```ruby
50
+ file_name = File.join([Rails.root.to_s, 'config', 'plaintext.yml'])
51
+ if File.file?(file_name)
52
+ config_file = File.read(file_name)
53
+ Plaintext::Configuration.load(config_file)
54
+ end
55
+ ```
56
+
57
+ #### Plain Ruby
58
+
59
+ Please overwrite `Plaintext::Configuration.load`.
60
+
61
+ ### Linux
62
+
63
+ On Linux the default configuration should work. However, make sure that the following packages are installed:
64
+
65
+ $ apt-get install catdoc unrtf poppler-utils tesseract-ocr
66
+
67
+ ### Mac OS X
68
+
69
+ On Mac things are still not complete. Please help us to have the same capabilities as under Linux. Right now we cannot
70
+ extract text from presentations and spreadsheets.
71
+
72
+ Please use homebrew to install the missing command line tools.
73
+
74
+ $ brew install unrtf poppler tesseract
75
+
76
+ The `plaintext.yml` should look like this:
77
+
78
+ ```yml
79
+ pdftotext:
80
+ - /usr/local/bin/pdftotext
81
+ - -enc
82
+ - UTF-8
83
+ - __FILE__
84
+ - '-'
85
+
86
+ unrtf:
87
+ - /usr/local/bin/unrtf
88
+ - --text
89
+ - __FILE__
90
+
91
+ tesseract:
92
+ - /usr/local/bin/tesseract
93
+ - __FILE__
94
+ - stdout
95
+
96
+ catdoc:
97
+ - /usr/bin/textutil
98
+ - -convert
99
+ - txt
100
+ - -stdout
101
+ - __FILE__
102
+ ```
103
+
104
+ ## Usage
105
+
106
+ ```ruby
107
+ # `file` is of type File.
108
+ # `content_type` is a String.
109
+ fulltext = Plaintext::Resolver.new(file, content_type).text
110
+ ```
111
+
112
+ ## License
113
+
114
+ The `plaintext` gem is free software; you can redistribute it and/or modify it under the terms of the GNU General
115
+ Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any
116
+ later version.
117
+
118
+ This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
119
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
120
+
121
+ You should have received a copy of the GNU General Public License along with the plugin. If not, see
122
+ [www.gnu.org/licenses](https://www.gnu.org/licenses/).
123
+
124
+ ## Contributing
125
+
126
+ Bug reports and pull requests are welcome on GitHub at https://github.com/planio-gmbh/plaintext.
127
+
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "plaintext"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,27 @@
# frozen_string_literal: true

module Plaintext
  # Helpers for turning byte strings of a known (or assumed) encoding into
  # valid UTF-8 text.
  module CodesetUtil
    # Returns +str+ as a valid UTF-8 string.
    #
    # str      - the String to convert; nil is returned unchanged.
    # encoding - name of the encoding +str+ is currently in. nil or an empty
    #            string means UTF-8.
    #
    # Bytes that cannot be represented in the result are replaced by '?'.
    # An unfrozen +str+ is re-tagged in place (as before); a frozen +str+ is
    # copied first so the caller's object is left untouched.
    def self.to_utf8(str, encoding)
      return str if str.nil?

      # force_encoding mutates its receiver and would raise on a frozen
      # string (e.g. a literal under `frozen_string_literal: true`).
      str = str.dup if str.frozen?
      str.force_encoding('ASCII-8BIT')
      return str.force_encoding('UTF-8') if str.empty?

      enc = encoding.to_s.empty? ? 'UTF-8' : encoding
      if enc.upcase == 'UTF-8'
        str.force_encoding('UTF-8')
        # Replace only the invalid byte sequences. Converting through
        # US-ASCII instead would also turn every valid non-ASCII character
        # into '?'.
        str = str.scrub('?') unless str.valid_encoding?
      else
        str.force_encoding(enc)
        str = str.encode('UTF-8', invalid: :replace,
                                  undef: :replace, replace: '?')
      end
      str
    end
  end
end
@@ -0,0 +1,27 @@
# frozen_string_literal: true

# This file references ERB and YAML directly, so require them here instead of
# relying on another file having loaded them first.
require 'erb'
require 'yaml'

module Plaintext
  # Holds the settings hash that handlers query with Configuration['key'].
  module Configuration
    class << self
      # The current settings Hash, or nil before the first load.
      attr_accessor :config

      # Returns a configuration setting.
      #
      # If nothing has been loaded yet, an empty configuration is loaded
      # first, so an unknown or unconfigured name yields nil.
      def [](name)
        load if config.nil?
        config[name]
      end

      # Replaces the current configuration.
      #
      # config_file - the *contents* of a configuration file as a String (it
      #               is rendered through ERB and then parsed as YAML), or
      #               nil to reset to an empty configuration.
      #
      # If the parsed document is not a Hash, a warning is printed and the
      # configuration stays empty.
      def load(config_file = nil)
        self.config = {}
        return unless config_file

        # NOTE(review): ERB evaluates embedded Ruby and YAML.load may build
        # arbitrary objects on older Psych versions; only pass trusted
        # configuration content here.
        file_config = YAML.load(ERB.new(config_file).result)
        if file_config.is_a?(Hash)
          self.config = file_config
        else
          warn "`config_file` is not a valid Plaintext configuration file, ignoring."
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
# frozen_string_literal: true

module Plaintext
  # Handler for the content types listed in CONTENT_TYPES. The command comes
  # from the 'catdoc' configuration entry and falls back to DEFAULT.
  class DocHandler < ExternalCommandHandler
    # Content types this handler registers for. Frozen because the same
    # array object is handed to every instance.
    CONTENT_TYPES = [
      'application/vnd.ms-word',
      'application/msword'
    ].freeze

    # Command used when no 'catdoc' entry is configured. '__FILE__' is the
    # placeholder for the path of the file to convert.
    DEFAULT = [
      '/usr/bin/catdoc', '-dutf-8', '__FILE__'
    ].freeze

    def initialize
      @content_types = CONTENT_TYPES
      @command = Plaintext::Configuration['catdoc'] || DEFAULT
    end
  end
end
@@ -0,0 +1,18 @@
# frozen_string_literal: true

module Plaintext
  # Handler for the image content types listed in CONTENT_TYPES. The command
  # comes from the 'tesseract' configuration entry and falls back to DEFAULT.
  class ImageHandler < ExternalCommandHandler
    # Content types this handler registers for. Frozen, like DEFAULT,
    # because the same array object is handed to every instance.
    CONTENT_TYPES = [
      'image/jpeg',
      'image/png',
      'image/tiff'
    ].freeze

    # Command used when no 'tesseract' entry is configured. '__FILE__' is
    # the placeholder for the path of the file to convert.
    DEFAULT = [
      '/usr/bin/tesseract', '__FILE__', 'stdout'
    ].freeze

    def initialize
      @content_types = CONTENT_TYPES
      @command = Plaintext::Configuration['tesseract'] || DEFAULT
    end
  end
end
@@ -0,0 +1,13 @@
# frozen_string_literal: true

module Plaintext
  # Handler for 'application/pdf'. The command comes from the 'pdftotext'
  # configuration entry and falls back to DEFAULT.
  class PdfHandler < ExternalCommandHandler
    # Fallback command; '__FILE__' marks where the file path is inserted.
    DEFAULT = [
      '/usr/bin/pdftotext',
      '-enc',
      'UTF-8',
      '__FILE__',
      '-'
    ].freeze

    def initialize
      configured = Plaintext::Configuration['pdftotext']
      @command = configured || DEFAULT
      # NOTE(review): singular @content_type here, while several sibling
      # handlers set @content_types - confirm FileHandler honours both.
      @content_type = 'application/pdf'
    end
  end
end
@@ -0,0 +1,17 @@
# frozen_string_literal: true

module Plaintext
  # Handler for the content types listed in CONTENT_TYPES. The command comes
  # from the 'catppt' configuration entry and falls back to DEFAULT.
  class PptHandler < ExternalCommandHandler
    # Content types this handler registers for. Frozen because the same
    # array object is handed to every instance.
    CONTENT_TYPES = [
      'application/vnd.ms-powerpoint',
      'application/powerpoint'
    ].freeze

    # Command used when no 'catppt' entry is configured. '__FILE__' is the
    # placeholder for the path of the file to convert.
    DEFAULT = [
      '/usr/bin/catppt', '-dutf-8', '__FILE__'
    ].freeze

    def initialize
      @content_types = CONTENT_TYPES
      @command = Plaintext::Configuration['catppt'] || DEFAULT
    end
  end
end
@@ -0,0 +1,13 @@
# frozen_string_literal: true

module Plaintext
  # Handler for 'application/rtf'. The command comes from the 'unrtf'
  # configuration entry and falls back to DEFAULT.
  class RtfHandler < ExternalCommandHandler
    # Fallback command; '__FILE__' marks where the file path is inserted.
    DEFAULT = [
      '/usr/bin/unrtf',
      '--text',
      '__FILE__'
    ].freeze

    def initialize
      configured = Plaintext::Configuration['unrtf']
      @command = configured || DEFAULT
      # NOTE(review): singular @content_type here, while several sibling
      # handlers set @content_types - confirm FileHandler honours both.
      @content_type = 'application/rtf'
    end
  end
end
@@ -0,0 +1,22 @@
# frozen_string_literal: true

module Plaintext
  # Handler for the content types listed in CONTENT_TYPES. The command comes
  # from the 'xls2csv' configuration entry and falls back to DEFAULT; its
  # output is then stripped of quotes and comma separators.
  class XlsHandler < ExternalCommandHandler
    # Content types this handler registers for. Frozen because the same
    # array object is handed to every instance.
    CONTENT_TYPES = [
      'application/vnd.ms-excel',
      'application/excel'
    ].freeze

    # Command used when no 'xls2csv' entry is configured. '__FILE__' is the
    # placeholder for the path of the file to convert.
    DEFAULT = [
      '/usr/bin/xls2csv', '-dutf-8', '__FILE__'
    ].freeze

    def initialize
      @content_types = CONTENT_TYPES
      @command = Plaintext::Configuration['xls2csv'] || DEFAULT
    end

    # Returns the superclass's text with every double quote removed and each
    # run of commas collapsed to a single space. Returns nil when the
    # superclass yields nil or false.
    def text(*_)
      csv = super
      return unless csv

      # Explicit parentheses: `gsub /,+/, ' '` triggers Ruby's "ambiguous
      # first argument" warning.
      csv.delete('"').gsub(/,+/, ' ')
    end
  end
end
@@ -0,0 +1,43 @@
# frozen_string_literal: true

require 'pathname'

module Plaintext
  # Base class for handlers that obtain a file's text by running an external
  # program and reading its standard output. Subclasses set @command to an
  # argument list that contains FILE_PLACEHOLDER where the file path belongs.
  class ExternalCommandHandler < FileHandler
    # TODO: Extract this to a proper module
    require 'fileutils'

    # Marker inside @command that #text replaces with the file's path.
    FILE_PLACEHOLDER = '__FILE__'.freeze

    # Executes the given command through IO.popen and yields an IO object
    # representing STDIN / STDOUT.
    #
    # Due to how popen works the command will be executed directly without
    # involving the shell if cmd is an array.
    #
    # cmd     - the command to run.
    # options - :write_stdin keeps the child's stdin open for the block;
    #           otherwise it is closed before the block runs.
    #
    # Returns the block's value (IO.popen returns the value of its block).
    def shellout(cmd, options = {}, &block)
      mode = 'r+'
      IO.popen(cmd, mode) do |io|
        io.set_encoding('ASCII-8BIT') if io.respond_to?(:set_encoding)
        io.close_write unless options[:write_stdin]
        block.call(io) if block_given?
      end
    end

    # Runs the configured command on +file+ and returns everything it wrote
    # to standard output as a String.
    #
    # Raises ArgumentError if the command has no FILE_PLACEHOLDER entry.
    # Previously this surfaced as an opaque TypeError from `cmd[nil] = ...`.
    def text(file)
      position = @command.index(FILE_PLACEHOLDER)
      if position.nil?
        raise ArgumentError,
              "command #{@command.inspect} does not contain #{FILE_PLACEHOLDER}"
      end

      # Work on a copy: @command may be a shared (frozen) DEFAULT constant.
      cmd = @command.dup
      cmd[position] = Pathname(file).to_s
      shellout(cmd) { |io| io.read }.to_s
    end

    # True when the superclass accepts +content_type+ and the external
    # program is available on this machine.
    def accept?(content_type)
      super && available?
    end

    # True when a command is configured and its first element is an
    # executable file. Uses plain Ruby checks instead of ActiveSupport's
    # `present?`, which this file does not load.
    def available?
      return false if @command.nil? || @command.empty?

      File.executable?(@command[0])
    end

    # Class-level shortcut: builds an instance and asks whether it is
    # available.
    def self.available?
      new.available?
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Plaintext
  # Handler for files that already are plain text (CSV and text/plain).
  class PlaintextHandler < FileHandler
    # Content types this handler accepts.
    CONTENT_TYPES = %w(text/csv text/plain)

    def initialize
      @content_types = CONTENT_TYPES
    end

    # Reads +file+ and passes its content through CodesetUtil.to_utf8.
    #
    # file - path of the file to read.
    #
    # Returns whatever Plaintext::CodesetUtil.to_utf8 returns.
    # NOTE(review): the 'UTF-8' argument is presumably the assumed source
    # encoding - confirm against CodesetUtil.
    def text(file)
      # File.read rather than IO.read: IO.read treats a path starting with
      # "|" as a command to execute, which must never happen for a path that
      # may originate from an upload.
      Plaintext::CodesetUtil.to_utf8 File.read(file), 'UTF-8'
    end
  end
end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
module Plaintext
  # Handler for Word documents in the Office Open XML (docx) format.
  # Only sets up the values the inherited extraction logic works with.
  class DocxHandler < OfficeDocumentHandler
    def initialize
      super
      # Namespace of the text elements to collect.
      @namespace_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
      # Archive entry to read.
      @file_name = 'word/document.xml'
      @content_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
module Plaintext
  # Handler for PowerPoint files in the Office Open XML format. Unlike its
  # siblings it reads several archive entries (one per slide) instead of a
  # single @file_name.
  class PptxHandler < OfficeDocumentHandler
    # Content types this handler accepts.
    CONTENT_TYPES = [
      'application/vnd.openxmlformats-officedocument.presentationml.presentation',
      'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
      'application/vnd.ms-powerpoint.template.macroEnabled.12'
    ]

    def initialize
      super
      @content_types = CONTENT_TYPES
      @namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main'
    end

    # Extracts the text of every archive entry whose name matches
    # slide<N>.xml and joins the results, ordered by N, with single spaces.
    #
    # file - path of the pptx archive.
    #
    # Returns a String (empty when no entry matches).
    def text(file)
      slides = []
      Zip::File.open(file) do |zip_file|
        zip_file.each do |entry|
          number = entry.name[/slide(\d+)\.xml/, 1]
          next unless number

          # Keep the number as an Integer: compared as strings, "10" sorts
          # before "2" and decks with ten or more slides come out scrambled.
          slides << [number.to_i, xml_to_text(entry.get_input_stream)]
        end
      end
      slides.sort_by(&:first).map(&:last).join(' ')
    end
  end
end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
module Plaintext
  # Handler for Excel workbooks in the Office Open XML (xlsx) format.
  # Only sets up the values the inherited extraction logic works with.
  class XlsxHandler < OfficeDocumentHandler
    def initialize
      super
      # Namespace of the text elements to collect.
      @namespace_uri = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
      # Archive entry to read.
      @file_name = 'xl/sharedStrings.xml'
      @content_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module Plaintext
  # Common parent of the MS Office extractors. Its only job is to preset the
  # name of the XML element ('t') that is looked up during extraction.
  class OfficeDocumentHandler < ZippedXmlHandler
    def initialize
      super()
      @element = 't'
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module Plaintext
  # Extractor for Open / Libre Office formats (presentations, text documents
  # and spreadsheets, including their template variants).
  class OpendocumentHandler < ZippedXmlHandler
    # Content types this handler accepts.
    CONTENT_TYPES = [
      'application/vnd.oasis.opendocument.presentation',
      'application/vnd.oasis.opendocument.presentation-template',
      'application/vnd.oasis.opendocument.text',
      'application/vnd.oasis.opendocument.text-template',
      'application/vnd.oasis.opendocument.spreadsheet',
      'application/vnd.oasis.opendocument.spreadsheet-template'
    ]

    # Sets up the archive entry, element name and namespace used by the
    # inherited extraction logic.
    def initialize
      super
      @content_types = CONTENT_TYPES
      @file_name = 'content.xml'
      @namespace_uri = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
      @element = 'p'
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module Plaintext
  # Handler base class for XML based (MS / Open / Libre) office documents.
  #
  # Subclasses set @file_name (archive entry to read), @element and
  # @namespace_uri (the XML element whose character data is collected).
  class ZippedXmlHandler < FileHandler
    require 'zip'
    require 'nokogiri'

    # SAX callback object that accumulates the character data found inside
    # elements with the configured name and namespace.
    class SaxDocument < Nokogiri::XML::SAX::Document
      # The text collected so far.
      attr_reader :text

      # text_element   - local name of the element to collect text from.
      # text_namespace - namespace URI that element must belong to.
      def initialize(text_element, text_namespace)
        @element = text_element
        @namespace_uri = text_namespace
        @text = ''.dup # mutable buffer despite frozen_string_literal
        @is_text = false
      end

      # Handle each element, expecting the name and any attributes.
      # Starts collecting when the configured element opens.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        @is_text = true if name == @element && uri == @namespace_uri
      end

      # Any characters between the start and end element expected as a string
      def characters(string)
        @text << string if @is_text
      end

      # Given the name of an element once its closing tag is reached.
      # Stops collecting and separates the chunk from the next one by a space.
      def end_element_namespace(name, prefix = nil, uri = nil)
        return unless name == @element && uri == @namespace_uri

        @text << ' '
        @is_text = false
      end
    end

    # Extracts the text of the archive entry named @file_name.
    #
    # file - path of the zip archive.
    #
    # Returns the extracted String, or nil when the archive has no entry
    # named @file_name.
    def text(file)
      Zip::File.open(file) do |zip_file|
        zip_file.each do |entry|
          return xml_to_text(entry.get_input_stream) if entry.name == @file_name
        end
      end
      # No matching entry: answer nil explicitly instead of leaking the
      # return value of Zip::File.open, which is not a String and would make
      # callers that post-process the text fail.
      nil
    end

    private

    # Parses the XML read from +io+ and returns the collected text.
    def xml_to_text(io)
      sax_doc = SaxDocument.new @element, @namespace_uri
      Nokogiri::XML::SAX::Parser.new(sax_doc).parse(io)
      sax_doc.text
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Plaintext
  # Root of the handler hierarchy. Subclasses declare what they can process
  # by setting either @content_type (one type) or @content_types (a list).
  class FileHandler
    # Tells whether this handler is responsible for +content_type+.
    #
    # @content_type, when set, takes precedence over @content_types. A
    # handler with neither set accepts nothing.
    #
    # Returns true or false.
    def accept?(content_type)
      return content_type == @content_type if @content_type
      return @content_types.include?(content_type) if @content_types

      false
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
module Plaintext
  # Finds the handler responsible for a content type and returns the
  # normalized text that handler extracts from a file.
  class Resolver
    MAX_FULLTEXT_LENGTH = 4_194_304 # 4 megabytes

    class << self
      # Handler instances built by .file_handlers; assignable from outside.
      attr_accessor :cached_file_handlers

      # Handler classes in lookup order; the first one that accepts the
      # content type wins.
      HANDLERS = [
        Plaintext::PdfHandler,
        Plaintext::OpendocumentHandler,
        Plaintext::DocxHandler, Plaintext::XlsxHandler, Plaintext::PptxHandler,
        Plaintext::DocHandler, Plaintext::XlsHandler, Plaintext::PptHandler,
        Plaintext::ImageHandler,
        Plaintext::RtfHandler,
        Plaintext::PlaintextHandler
      ].freeze

      # Returns one instance per handler class, built on first use and
      # reused while the cache is non-blank.
      def file_handlers
        return cached_file_handlers if cached_file_handlers.present?

        self.cached_file_handlers = HANDLERS.map(&:new)
      end
    end

    # file         - the file to extract text from (handed to the handler).
    # content_type - content type used to select the handler.
    def initialize(file, content_type = nil)
      @file = file
      @content_type = content_type
    end

    # Returns the extracted fulltext or nil if no matching handler was found
    # for the file type (or the handler produced nothing).
    #
    # Whitespace runs are collapsed to single spaces, the result is stripped,
    # composed and limited to MAX_FULLTEXT_LENGTH.
    def text
      handler = find_handler
      return unless handler

      raw = handler.text(@file)
      return unless raw

      # Non-destructive gsub/strip: the handler's return value is left
      # untouched, so a frozen string does not raise FrozenError.
      normalized = raw.gsub(/\s+/m, ' ').strip
      normalized.mb_chars.compose.limit(MAX_FULLTEXT_LENGTH).to_s
    end

    private

    # First cached handler that accepts @content_type, or nil.
    def find_handler
      self.class.file_handlers.detect { |h| h.accept? @content_type }
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Plaintext
  # Version of the gem; plaintext.gemspec reads it for spec.version.
  VERSION = '0.1.0'
end
data/lib/plaintext.rb ADDED
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'active_support/core_ext/string'
4
+
5
+ require 'plaintext/version'
6
+ require 'plaintext/configuration'
7
+
8
+ require 'plaintext/codeset_util'
9
+
10
+ require 'plaintext/file_handler'
11
+ require 'plaintext/file_handler/external_command_handler'
12
+ require 'plaintext/file_handler/external_command_handler/doc_handler'
13
+ require 'plaintext/file_handler/external_command_handler/image_handler'
14
+ require 'plaintext/file_handler/external_command_handler/pdf_handler'
15
+ require 'plaintext/file_handler/external_command_handler/ppt_handler'
16
+ require 'plaintext/file_handler/external_command_handler/rtf_handler'
17
+ require 'plaintext/file_handler/external_command_handler/xls_handler'
18
+
19
+ require 'plaintext/file_handler/zipped_xml_handler'
20
+ require 'plaintext/file_handler/zipped_xml_handler/office_document_handler'
21
+ require 'plaintext/file_handler/zipped_xml_handler/office_document_handler/docx_handler'
22
+ require 'plaintext/file_handler/zipped_xml_handler/office_document_handler/pptx_handler'
23
+ require 'plaintext/file_handler/zipped_xml_handler/office_document_handler/xlsx_handler'
24
+ require 'plaintext/file_handler/zipped_xml_handler/opendocument_handler'
25
+
26
+ require 'plaintext/file_handler/plaintext_handler'
27
+
28
+ require 'plaintext/resolver'
data/plaintext.gemspec ADDED
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
# Make lib/ loadable so the version constant can be required below.
lib = File.expand_path('../lib', __FILE__)
unless $LOAD_PATH.include?(lib)
  $LOAD_PATH.unshift(lib)
end
require 'plaintext/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name    = 'plaintext'
  spec.version = Plaintext::VERSION
  spec.authors = ['Jens Krämer', 'Planio GmbH', 'OpenProject GmbH']
  spec.email   = ['info@openproject.com']

  # Description
  spec.summary     = 'Extract plain text from most common office documents.'
  spec.description = "Extract text from common office files. Based on the file's content type a command line tool is selected to do the job."
  spec.homepage    = 'https://github.com/planio-gmbh/plaintext'
  spec.license     = 'GPL-2.0'

  # Packaged files: everything tracked by git except test/spec/features.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files         = tracked_files.reject { |path| path =~ %r{^(test|spec|features)/} }
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'rubyzip', '~> 1.2.1'
  spec.add_dependency 'nokogiri', '~> 1.8.1'
  spec.add_dependency 'activesupport', '>2.2.1 '

  # Development dependencies
  spec.add_development_dependency 'bundler', '~> 1.10'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec'
end
@@ -0,0 +1,41 @@
1
+ # This is an example configuration file. Copy this file to your application config folder. In a Rails application that
2
+ # would be `<Rails.root>/config/plaintext.yml`
3
+ #
4
+ # Text extraction helper programs.
5
+ #
6
+ # Commands should write the resulting plain text to STDOUT. Use __FILE__ as
7
+ # placeholder for the file path. The values below are the defaults.
8
+
9
+ # apt-get install poppler-utils
10
+ # pdftotext:
11
+ # - /usr/bin/pdftotext
12
+ # - -enc
13
+ # - UTF-8
14
+ # - __FILE__
15
+ # - '-'
16
+
17
+ # apt-get install unrtf
18
+ # unrtf:
19
+ # - /usr/bin/unrtf
20
+ # - --text
21
+ # - __FILE__
22
+
23
+ # apt-get install catdoc
24
+ # catdoc:
25
+ # - /usr/bin/catdoc
26
+ # - -dutf-8
27
+ # - __FILE__
28
+ # xls2csv:
29
+ # - /usr/bin/xls2csv
30
+ # - -dutf-8
31
+ # - __FILE__
32
+ # catppt:
33
+ # - /usr/bin/catppt
34
+ # - -dutf-8
35
+ # - __FILE__
36
+
37
+ # apt-get install tesseract-ocr
38
+ # tesseract:
39
+ # - /usr/bin/tesseract
40
+ # - -dutf-8
41
+ # - __FILE__
metadata ADDED
@@ -0,0 +1,162 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: plaintext
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jens Krämer
8
+ - Planio GmbH
9
+ - OpenProject GmbH
10
+ autorequire:
11
+ bindir: exe
12
+ cert_chain: []
13
+ date: 2018-02-15 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rubyzip
17
+ requirement: !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - "~>"
20
+ - !ruby/object:Gem::Version
21
+ version: 1.2.1
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - "~>"
27
+ - !ruby/object:Gem::Version
28
+ version: 1.2.1
29
+ - !ruby/object:Gem::Dependency
30
+ name: nokogiri
31
+ requirement: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - "~>"
34
+ - !ruby/object:Gem::Version
35
+ version: 1.8.1
36
+ type: :runtime
37
+ prerelease: false
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - "~>"
41
+ - !ruby/object:Gem::Version
42
+ version: 1.8.1
43
+ - !ruby/object:Gem::Dependency
44
+ name: activesupport
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">"
48
+ - !ruby/object:Gem::Version
49
+ version: 2.2.1
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">"
55
+ - !ruby/object:Gem::Version
56
+ version: 2.2.1
57
+ - !ruby/object:Gem::Dependency
58
+ name: bundler
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - "~>"
62
+ - !ruby/object:Gem::Version
63
+ version: '1.10'
64
+ type: :development
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - "~>"
69
+ - !ruby/object:Gem::Version
70
+ version: '1.10'
71
+ - !ruby/object:Gem::Dependency
72
+ name: rake
73
+ requirement: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - "~>"
76
+ - !ruby/object:Gem::Version
77
+ version: '10.0'
78
+ type: :development
79
+ prerelease: false
80
+ version_requirements: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - "~>"
83
+ - !ruby/object:Gem::Version
84
+ version: '10.0'
85
+ - !ruby/object:Gem::Dependency
86
+ name: rspec
87
+ requirement: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: '0'
92
+ type: :development
93
+ prerelease: false
94
+ version_requirements: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
99
+ description: Extract text from common office files. Based on the file's content type
100
+ a command line tool is selected to do the job.
101
+ email:
102
+ - info@openproject.com
103
+ executables: []
104
+ extensions: []
105
+ extra_rdoc_files: []
106
+ files:
107
+ - ".gitignore"
108
+ - ".rspec"
109
+ - ".travis.yml"
110
+ - Gemfile
111
+ - LICENSE
112
+ - README.md
113
+ - Rakefile
114
+ - bin/console
115
+ - bin/setup
116
+ - lib/plaintext.rb
117
+ - lib/plaintext/codeset_util.rb
118
+ - lib/plaintext/configuration.rb
119
+ - lib/plaintext/file_handler.rb
120
+ - lib/plaintext/file_handler/external_command_handler.rb
121
+ - lib/plaintext/file_handler/external_command_handler/doc_handler.rb
122
+ - lib/plaintext/file_handler/external_command_handler/image_handler.rb
123
+ - lib/plaintext/file_handler/external_command_handler/pdf_handler.rb
124
+ - lib/plaintext/file_handler/external_command_handler/ppt_handler.rb
125
+ - lib/plaintext/file_handler/external_command_handler/rtf_handler.rb
126
+ - lib/plaintext/file_handler/external_command_handler/xls_handler.rb
127
+ - lib/plaintext/file_handler/plaintext_handler.rb
128
+ - lib/plaintext/file_handler/zipped_xml_handler.rb
129
+ - lib/plaintext/file_handler/zipped_xml_handler/office_document_handler.rb
130
+ - lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/docx_handler.rb
131
+ - lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/pptx_handler.rb
132
+ - lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/xlsx_handler.rb
133
+ - lib/plaintext/file_handler/zipped_xml_handler/opendocument_handler.rb
134
+ - lib/plaintext/resolver.rb
135
+ - lib/plaintext/version.rb
136
+ - plaintext.gemspec
137
+ - plaintext.yml.example
138
+ homepage: https://github.com/planio-gmbh/plaintext
139
+ licenses:
140
+ - GPL-2.0
141
+ metadata: {}
142
+ post_install_message:
143
+ rdoc_options: []
144
+ require_paths:
145
+ - lib
146
+ required_ruby_version: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - ">="
149
+ - !ruby/object:Gem::Version
150
+ version: '0'
151
+ required_rubygems_version: !ruby/object:Gem::Requirement
152
+ requirements:
153
+ - - ">="
154
+ - !ruby/object:Gem::Version
155
+ version: '0'
156
+ requirements: []
157
+ rubyforge_project:
158
+ rubygems_version: 2.6.13
159
+ signing_key:
160
+ specification_version: 4
161
+ summary: Extract plain text from most common office documents.
162
+ test_files: []