plaintext 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +2 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/LICENSE +339 -0
- data/README.md +127 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/plaintext/codeset_util.rb +27 -0
- data/lib/plaintext/configuration.rb +27 -0
- data/lib/plaintext/file_handler/external_command_handler/doc_handler.rb +17 -0
- data/lib/plaintext/file_handler/external_command_handler/image_handler.rb +18 -0
- data/lib/plaintext/file_handler/external_command_handler/pdf_handler.rb +13 -0
- data/lib/plaintext/file_handler/external_command_handler/ppt_handler.rb +17 -0
- data/lib/plaintext/file_handler/external_command_handler/rtf_handler.rb +13 -0
- data/lib/plaintext/file_handler/external_command_handler/xls_handler.rb +22 -0
- data/lib/plaintext/file_handler/external_command_handler.rb +43 -0
- data/lib/plaintext/file_handler/plaintext_handler.rb +14 -0
- data/lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/docx_handler.rb +12 -0
- data/lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/pptx_handler.rb +30 -0
- data/lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/xlsx_handler.rb +12 -0
- data/lib/plaintext/file_handler/zipped_xml_handler/office_document_handler.rb +11 -0
- data/lib/plaintext/file_handler/zipped_xml_handler/opendocument_handler.rb +22 -0
- data/lib/plaintext/file_handler/zipped_xml_handler.rb +58 -0
- data/lib/plaintext/file_handler.rb +15 -0
- data/lib/plaintext/resolver.rb +48 -0
- data/lib/plaintext/version.rb +5 -0
- data/lib/plaintext.rb +28 -0
- data/plaintext.gemspec +29 -0
- data/plaintext.yml.example +41 -0
- metadata +162 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a474b4e8e63d06f2d83a97a7ff09c3b3375b27ed
|
4
|
+
data.tar.gz: 2cc7395ee7ab12c5588b9c187823ba7e126d91bd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7afd62607f1f46c95278952591f6bda1325ea153b1006dc0092cd0128b615ffa4a11668315084c327541fca5591e5195d30c2635e51ca7a4391c67bd89df8b8d
|
7
|
+
data.tar.gz: a64aa765178280422de0a05d201ee11b2779649184ff2d72572e223c2e1bf160ce1c0f12b256e59c8ae2b102bcba699e7fd051205326c80d8dfd51b6690aae65
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,339 @@
|
|
1
|
+
GNU GENERAL PUBLIC LICENSE
|
2
|
+
Version 2, June 1991
|
3
|
+
|
4
|
+
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
|
5
|
+
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
6
|
+
Everyone is permitted to copy and distribute verbatim copies
|
7
|
+
of this license document, but changing it is not allowed.
|
8
|
+
|
9
|
+
Preamble
|
10
|
+
|
11
|
+
The licenses for most software are designed to take away your
|
12
|
+
freedom to share and change it. By contrast, the GNU General Public
|
13
|
+
License is intended to guarantee your freedom to share and change free
|
14
|
+
software--to make sure the software is free for all its users. This
|
15
|
+
General Public License applies to most of the Free Software
|
16
|
+
Foundation's software and to any other program whose authors commit to
|
17
|
+
using it. (Some other Free Software Foundation software is covered by
|
18
|
+
the GNU Lesser General Public License instead.) You can apply it to
|
19
|
+
your programs, too.
|
20
|
+
|
21
|
+
When we speak of free software, we are referring to freedom, not
|
22
|
+
price. Our General Public Licenses are designed to make sure that you
|
23
|
+
have the freedom to distribute copies of free software (and charge for
|
24
|
+
this service if you wish), that you receive source code or can get it
|
25
|
+
if you want it, that you can change the software or use pieces of it
|
26
|
+
in new free programs; and that you know you can do these things.
|
27
|
+
|
28
|
+
To protect your rights, we need to make restrictions that forbid
|
29
|
+
anyone to deny you these rights or to ask you to surrender the rights.
|
30
|
+
These restrictions translate to certain responsibilities for you if you
|
31
|
+
distribute copies of the software, or if you modify it.
|
32
|
+
|
33
|
+
For example, if you distribute copies of such a program, whether
|
34
|
+
gratis or for a fee, you must give the recipients all the rights that
|
35
|
+
you have. You must make sure that they, too, receive or can get the
|
36
|
+
source code. And you must show them these terms so they know their
|
37
|
+
rights.
|
38
|
+
|
39
|
+
We protect your rights with two steps: (1) copyright the software, and
|
40
|
+
(2) offer you this license which gives you legal permission to copy,
|
41
|
+
distribute and/or modify the software.
|
42
|
+
|
43
|
+
Also, for each author's protection and ours, we want to make certain
|
44
|
+
that everyone understands that there is no warranty for this free
|
45
|
+
software. If the software is modified by someone else and passed on, we
|
46
|
+
want its recipients to know that what they have is not the original, so
|
47
|
+
that any problems introduced by others will not reflect on the original
|
48
|
+
authors' reputations.
|
49
|
+
|
50
|
+
Finally, any free program is threatened constantly by software
|
51
|
+
patents. We wish to avoid the danger that redistributors of a free
|
52
|
+
program will individually obtain patent licenses, in effect making the
|
53
|
+
program proprietary. To prevent this, we have made it clear that any
|
54
|
+
patent must be licensed for everyone's free use or not licensed at all.
|
55
|
+
|
56
|
+
The precise terms and conditions for copying, distribution and
|
57
|
+
modification follow.
|
58
|
+
|
59
|
+
GNU GENERAL PUBLIC LICENSE
|
60
|
+
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
61
|
+
|
62
|
+
0. This License applies to any program or other work which contains
|
63
|
+
a notice placed by the copyright holder saying it may be distributed
|
64
|
+
under the terms of this General Public License. The "Program", below,
|
65
|
+
refers to any such program or work, and a "work based on the Program"
|
66
|
+
means either the Program or any derivative work under copyright law:
|
67
|
+
that is to say, a work containing the Program or a portion of it,
|
68
|
+
either verbatim or with modifications and/or translated into another
|
69
|
+
language. (Hereinafter, translation is included without limitation in
|
70
|
+
the term "modification".) Each licensee is addressed as "you".
|
71
|
+
|
72
|
+
Activities other than copying, distribution and modification are not
|
73
|
+
covered by this License; they are outside its scope. The act of
|
74
|
+
running the Program is not restricted, and the output from the Program
|
75
|
+
is covered only if its contents constitute a work based on the
|
76
|
+
Program (independent of having been made by running the Program).
|
77
|
+
Whether that is true depends on what the Program does.
|
78
|
+
|
79
|
+
1. You may copy and distribute verbatim copies of the Program's
|
80
|
+
source code as you receive it, in any medium, provided that you
|
81
|
+
conspicuously and appropriately publish on each copy an appropriate
|
82
|
+
copyright notice and disclaimer of warranty; keep intact all the
|
83
|
+
notices that refer to this License and to the absence of any warranty;
|
84
|
+
and give any other recipients of the Program a copy of this License
|
85
|
+
along with the Program.
|
86
|
+
|
87
|
+
You may charge a fee for the physical act of transferring a copy, and
|
88
|
+
you may at your option offer warranty protection in exchange for a fee.
|
89
|
+
|
90
|
+
2. You may modify your copy or copies of the Program or any portion
|
91
|
+
of it, thus forming a work based on the Program, and copy and
|
92
|
+
distribute such modifications or work under the terms of Section 1
|
93
|
+
above, provided that you also meet all of these conditions:
|
94
|
+
|
95
|
+
a) You must cause the modified files to carry prominent notices
|
96
|
+
stating that you changed the files and the date of any change.
|
97
|
+
|
98
|
+
b) You must cause any work that you distribute or publish, that in
|
99
|
+
whole or in part contains or is derived from the Program or any
|
100
|
+
part thereof, to be licensed as a whole at no charge to all third
|
101
|
+
parties under the terms of this License.
|
102
|
+
|
103
|
+
c) If the modified program normally reads commands interactively
|
104
|
+
when run, you must cause it, when started running for such
|
105
|
+
interactive use in the most ordinary way, to print or display an
|
106
|
+
announcement including an appropriate copyright notice and a
|
107
|
+
notice that there is no warranty (or else, saying that you provide
|
108
|
+
a warranty) and that users may redistribute the program under
|
109
|
+
these conditions, and telling the user how to view a copy of this
|
110
|
+
License. (Exception: if the Program itself is interactive but
|
111
|
+
does not normally print such an announcement, your work based on
|
112
|
+
the Program is not required to print an announcement.)
|
113
|
+
|
114
|
+
These requirements apply to the modified work as a whole. If
|
115
|
+
identifiable sections of that work are not derived from the Program,
|
116
|
+
and can be reasonably considered independent and separate works in
|
117
|
+
themselves, then this License, and its terms, do not apply to those
|
118
|
+
sections when you distribute them as separate works. But when you
|
119
|
+
distribute the same sections as part of a whole which is a work based
|
120
|
+
on the Program, the distribution of the whole must be on the terms of
|
121
|
+
this License, whose permissions for other licensees extend to the
|
122
|
+
entire whole, and thus to each and every part regardless of who wrote it.
|
123
|
+
|
124
|
+
Thus, it is not the intent of this section to claim rights or contest
|
125
|
+
your rights to work written entirely by you; rather, the intent is to
|
126
|
+
exercise the right to control the distribution of derivative or
|
127
|
+
collective works based on the Program.
|
128
|
+
|
129
|
+
In addition, mere aggregation of another work not based on the Program
|
130
|
+
with the Program (or with a work based on the Program) on a volume of
|
131
|
+
a storage or distribution medium does not bring the other work under
|
132
|
+
the scope of this License.
|
133
|
+
|
134
|
+
3. You may copy and distribute the Program (or a work based on it,
|
135
|
+
under Section 2) in object code or executable form under the terms of
|
136
|
+
Sections 1 and 2 above provided that you also do one of the following:
|
137
|
+
|
138
|
+
a) Accompany it with the complete corresponding machine-readable
|
139
|
+
source code, which must be distributed under the terms of Sections
|
140
|
+
1 and 2 above on a medium customarily used for software interchange; or,
|
141
|
+
|
142
|
+
b) Accompany it with a written offer, valid for at least three
|
143
|
+
years, to give any third party, for a charge no more than your
|
144
|
+
cost of physically performing source distribution, a complete
|
145
|
+
machine-readable copy of the corresponding source code, to be
|
146
|
+
distributed under the terms of Sections 1 and 2 above on a medium
|
147
|
+
customarily used for software interchange; or,
|
148
|
+
|
149
|
+
c) Accompany it with the information you received as to the offer
|
150
|
+
to distribute corresponding source code. (This alternative is
|
151
|
+
allowed only for noncommercial distribution and only if you
|
152
|
+
received the program in object code or executable form with such
|
153
|
+
an offer, in accord with Subsection b above.)
|
154
|
+
|
155
|
+
The source code for a work means the preferred form of the work for
|
156
|
+
making modifications to it. For an executable work, complete source
|
157
|
+
code means all the source code for all modules it contains, plus any
|
158
|
+
associated interface definition files, plus the scripts used to
|
159
|
+
control compilation and installation of the executable. However, as a
|
160
|
+
special exception, the source code distributed need not include
|
161
|
+
anything that is normally distributed (in either source or binary
|
162
|
+
form) with the major components (compiler, kernel, and so on) of the
|
163
|
+
operating system on which the executable runs, unless that component
|
164
|
+
itself accompanies the executable.
|
165
|
+
|
166
|
+
If distribution of executable or object code is made by offering
|
167
|
+
access to copy from a designated place, then offering equivalent
|
168
|
+
access to copy the source code from the same place counts as
|
169
|
+
distribution of the source code, even though third parties are not
|
170
|
+
compelled to copy the source along with the object code.
|
171
|
+
|
172
|
+
4. You may not copy, modify, sublicense, or distribute the Program
|
173
|
+
except as expressly provided under this License. Any attempt
|
174
|
+
otherwise to copy, modify, sublicense or distribute the Program is
|
175
|
+
void, and will automatically terminate your rights under this License.
|
176
|
+
However, parties who have received copies, or rights, from you under
|
177
|
+
this License will not have their licenses terminated so long as such
|
178
|
+
parties remain in full compliance.
|
179
|
+
|
180
|
+
5. You are not required to accept this License, since you have not
|
181
|
+
signed it. However, nothing else grants you permission to modify or
|
182
|
+
distribute the Program or its derivative works. These actions are
|
183
|
+
prohibited by law if you do not accept this License. Therefore, by
|
184
|
+
modifying or distributing the Program (or any work based on the
|
185
|
+
Program), you indicate your acceptance of this License to do so, and
|
186
|
+
all its terms and conditions for copying, distributing or modifying
|
187
|
+
the Program or works based on it.
|
188
|
+
|
189
|
+
6. Each time you redistribute the Program (or any work based on the
|
190
|
+
Program), the recipient automatically receives a license from the
|
191
|
+
original licensor to copy, distribute or modify the Program subject to
|
192
|
+
these terms and conditions. You may not impose any further
|
193
|
+
restrictions on the recipients' exercise of the rights granted herein.
|
194
|
+
You are not responsible for enforcing compliance by third parties to
|
195
|
+
this License.
|
196
|
+
|
197
|
+
7. If, as a consequence of a court judgment or allegation of patent
|
198
|
+
infringement or for any other reason (not limited to patent issues),
|
199
|
+
conditions are imposed on you (whether by court order, agreement or
|
200
|
+
otherwise) that contradict the conditions of this License, they do not
|
201
|
+
excuse you from the conditions of this License. If you cannot
|
202
|
+
distribute so as to satisfy simultaneously your obligations under this
|
203
|
+
License and any other pertinent obligations, then as a consequence you
|
204
|
+
may not distribute the Program at all. For example, if a patent
|
205
|
+
license would not permit royalty-free redistribution of the Program by
|
206
|
+
all those who receive copies directly or indirectly through you, then
|
207
|
+
the only way you could satisfy both it and this License would be to
|
208
|
+
refrain entirely from distribution of the Program.
|
209
|
+
|
210
|
+
If any portion of this section is held invalid or unenforceable under
|
211
|
+
any particular circumstance, the balance of the section is intended to
|
212
|
+
apply and the section as a whole is intended to apply in other
|
213
|
+
circumstances.
|
214
|
+
|
215
|
+
It is not the purpose of this section to induce you to infringe any
|
216
|
+
patents or other property right claims or to contest validity of any
|
217
|
+
such claims; this section has the sole purpose of protecting the
|
218
|
+
integrity of the free software distribution system, which is
|
219
|
+
implemented by public license practices. Many people have made
|
220
|
+
generous contributions to the wide range of software distributed
|
221
|
+
through that system in reliance on consistent application of that
|
222
|
+
system; it is up to the author/donor to decide if he or she is willing
|
223
|
+
to distribute software through any other system and a licensee cannot
|
224
|
+
impose that choice.
|
225
|
+
|
226
|
+
This section is intended to make thoroughly clear what is believed to
|
227
|
+
be a consequence of the rest of this License.
|
228
|
+
|
229
|
+
8. If the distribution and/or use of the Program is restricted in
|
230
|
+
certain countries either by patents or by copyrighted interfaces, the
|
231
|
+
original copyright holder who places the Program under this License
|
232
|
+
may add an explicit geographical distribution limitation excluding
|
233
|
+
those countries, so that distribution is permitted only in or among
|
234
|
+
countries not thus excluded. In such case, this License incorporates
|
235
|
+
the limitation as if written in the body of this License.
|
236
|
+
|
237
|
+
9. The Free Software Foundation may publish revised and/or new versions
|
238
|
+
of the General Public License from time to time. Such new versions will
|
239
|
+
be similar in spirit to the present version, but may differ in detail to
|
240
|
+
address new problems or concerns.
|
241
|
+
|
242
|
+
Each version is given a distinguishing version number. If the Program
|
243
|
+
specifies a version number of this License which applies to it and "any
|
244
|
+
later version", you have the option of following the terms and conditions
|
245
|
+
either of that version or of any later version published by the Free
|
246
|
+
Software Foundation. If the Program does not specify a version number of
|
247
|
+
this License, you may choose any version ever published by the Free Software
|
248
|
+
Foundation.
|
249
|
+
|
250
|
+
10. If you wish to incorporate parts of the Program into other free
|
251
|
+
programs whose distribution conditions are different, write to the author
|
252
|
+
to ask for permission. For software which is copyrighted by the Free
|
253
|
+
Software Foundation, write to the Free Software Foundation; we sometimes
|
254
|
+
make exceptions for this. Our decision will be guided by the two goals
|
255
|
+
of preserving the free status of all derivatives of our free software and
|
256
|
+
of promoting the sharing and reuse of software generally.
|
257
|
+
|
258
|
+
NO WARRANTY
|
259
|
+
|
260
|
+
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
261
|
+
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
262
|
+
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
263
|
+
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
|
264
|
+
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
265
|
+
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
|
266
|
+
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
|
267
|
+
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
|
268
|
+
REPAIR OR CORRECTION.
|
269
|
+
|
270
|
+
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
271
|
+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
272
|
+
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
|
273
|
+
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
|
274
|
+
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
|
275
|
+
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
|
276
|
+
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
277
|
+
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
278
|
+
POSSIBILITY OF SUCH DAMAGES.
|
279
|
+
|
280
|
+
END OF TERMS AND CONDITIONS
|
281
|
+
|
282
|
+
How to Apply These Terms to Your New Programs
|
283
|
+
|
284
|
+
If you develop a new program, and you want it to be of the greatest
|
285
|
+
possible use to the public, the best way to achieve this is to make it
|
286
|
+
free software which everyone can redistribute and change under these terms.
|
287
|
+
|
288
|
+
To do so, attach the following notices to the program. It is safest
|
289
|
+
to attach them to the start of each source file to most effectively
|
290
|
+
convey the exclusion of warranty; and each file should have at least
|
291
|
+
the "copyright" line and a pointer to where the full notice is found.
|
292
|
+
|
293
|
+
<one line to give the program's name and a brief idea of what it does.>
|
294
|
+
Copyright (C) <year> <name of author>
|
295
|
+
|
296
|
+
This program is free software; you can redistribute it and/or modify
|
297
|
+
it under the terms of the GNU General Public License as published by
|
298
|
+
the Free Software Foundation; either version 2 of the License, or
|
299
|
+
(at your option) any later version.
|
300
|
+
|
301
|
+
This program is distributed in the hope that it will be useful,
|
302
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
303
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
304
|
+
GNU General Public License for more details.
|
305
|
+
|
306
|
+
You should have received a copy of the GNU General Public License along
|
307
|
+
with this program; if not, write to the Free Software Foundation, Inc.,
|
308
|
+
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
309
|
+
|
310
|
+
Also add information on how to contact you by electronic and paper mail.
|
311
|
+
|
312
|
+
If the program is interactive, make it output a short notice like this
|
313
|
+
when it starts in an interactive mode:
|
314
|
+
|
315
|
+
Gnomovision version 69, Copyright (C) year name of author
|
316
|
+
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
317
|
+
This is free software, and you are welcome to redistribute it
|
318
|
+
under certain conditions; type `show c' for details.
|
319
|
+
|
320
|
+
The hypothetical commands `show w' and `show c' should show the appropriate
|
321
|
+
parts of the General Public License. Of course, the commands you use may
|
322
|
+
be called something other than `show w' and `show c'; they could even be
|
323
|
+
mouse-clicks or menu items--whatever suits your program.
|
324
|
+
|
325
|
+
You should also get your employer (if you work as a programmer) or your
|
326
|
+
school, if any, to sign a "copyright disclaimer" for the program, if
|
327
|
+
necessary. Here is a sample; alter the names:
|
328
|
+
|
329
|
+
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
|
330
|
+
`Gnomovision' (which makes passes at compilers) written by James Hacker.
|
331
|
+
|
332
|
+
<signature of Ty Coon>, 1 April 1989
|
333
|
+
Ty Coon, President of Vice
|
334
|
+
|
335
|
+
This General Public License does not permit incorporating your program into
|
336
|
+
proprietary programs. If your program is a subroutine library, you may
|
337
|
+
consider it more useful to permit linking proprietary applications with the
|
338
|
+
library. If this is what you want to do, use the GNU Lesser General
|
339
|
+
Public License instead of this License.
|
data/README.md
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
# plaintext
|
2
|
+
|
3
|
+
This gem wraps command line tools to extract plain text from typical files such as
|
4
|
+
|
5
|
+
- PDF
|
6
|
+
- RTF
|
7
|
+
- MS Office
|
8
|
+
- Word (doc, docx)
|
9
|
+
- Excel (xls, xlsx)
|
10
|
+
- PowerPoint (ppt, pptx)
|
11
|
+
- OpenOffice + Libre
|
12
|
+
- Presentation
|
13
|
+
- Text
|
14
|
+
- Spreadsheet
|
15
|
+
- Image files (png, jpeg, tiff), such as screenshots and scanned documents, through character recognition (OCR)
|
16
|
+
- Plaintext (txt)
|
17
|
+
- Comma-separated values (csv)
|
18
|
+
|
19
|
+
## Acknowledgements
|
20
|
+
|
21
|
+
This gem is based on work by Jens Krämer / Planio, who originally provided it as a
|
22
|
+
[patch for Redmine](https://www.redmine.org/issues/306). Now, it is a collaborative effort of
|
23
|
+
both project management software providers [Planio](https://plan.io) and [OpenProject](https://openproject.org)
|
24
|
+
as both systems tackle the identical challenge of extracting plain text from attachment files.
|
25
|
+
|
26
|
+
## Installation
|
27
|
+
|
28
|
+
Add this line to your application's Gemfile:
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
gem 'plaintext'
|
32
|
+
```
|
33
|
+
|
34
|
+
And then execute:
|
35
|
+
|
36
|
+
$ bundle
|
37
|
+
|
38
|
+
Or install it yourself as:
|
39
|
+
|
40
|
+
$ gem install plaintext
|
41
|
+
|
42
|
+
#### Rails
|
43
|
+
|
44
|
+
In a Rails application, save `plaintext.yml.example` as `config/plaintext.yml` and adjust the settings to
|
45
|
+
your needs.
|
46
|
+
|
47
|
+
Then load that configuration file in an initializer. Add the following lines to `config/initializers/plaintext.rb`:
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
file_name = File.join([Rails.root.to_s, 'config', 'plaintext.yml'])
|
51
|
+
if File.file?(file_name)
|
52
|
+
config_file = File.read(file_name)
|
53
|
+
Plaintext::Configuration.load(config_file)
|
54
|
+
end
|
55
|
+
```
|
56
|
+
|
57
|
+
#### Plain Ruby
|
58
|
+
|
59
|
+
Please overwrite `Plaintext::Configuration.load`.
|
60
|
+
|
61
|
+
### Linux
|
62
|
+
|
63
|
+
On Linux the default configuration should work. However, make sure that the following packages are installed:
|
64
|
+
|
65
|
+
$ apt-get install catdoc unrtf poppler-utils tesseract-ocr
|
66
|
+
|
67
|
+
### Mac OS X
|
68
|
+
|
69
|
+
On Mac things are still not complete. Please help us to have the same capabilities as under Linux. Right now we cannot
|
70
|
+
extract text from presentations and spreadsheets.
|
71
|
+
|
72
|
+
Please use homebrew to install the missing command line tools.
|
73
|
+
|
74
|
+
$ brew install unrtf poppler tesseract
|
75
|
+
|
76
|
+
The `plaintext.yml` should look like this:
|
77
|
+
|
78
|
+
```yml
|
79
|
+
pdftotext:
|
80
|
+
- /usr/local/bin/pdftotext
|
81
|
+
- -enc
|
82
|
+
- UTF-8
|
83
|
+
- __FILE__
|
84
|
+
- '-'
|
85
|
+
|
86
|
+
unrtf:
|
87
|
+
- /usr/local/bin/unrtf
|
88
|
+
- --text
|
89
|
+
- __FILE__
|
90
|
+
|
91
|
+
tesseract:
|
92
|
+
- /usr/local/bin/tesseract
|
93
|
+
- __FILE__
|
94
|
+
- stdout
|
95
|
+
|
96
|
+
catdoc:
|
97
|
+
- /usr/bin/textutil
|
98
|
+
- -convert
|
99
|
+
- txt
|
100
|
+
- -stdout
|
101
|
+
- __FILE__
|
102
|
+
```
|
103
|
+
|
104
|
+
## Usage
|
105
|
+
|
106
|
+
```ruby
|
107
|
+
# `file` is of type File.
|
108
|
+
# `content_type` is a String.
|
109
|
+
fulltext = Plaintext::Resolver.new(file, content_type).text
|
110
|
+
```
|
111
|
+
|
112
|
+
## License
|
113
|
+
|
114
|
+
The `plaintext` gem is free software; you can redistribute it and/or modify it under the terms of the GNU General
|
115
|
+
Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any
|
116
|
+
later version.
|
117
|
+
|
118
|
+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
|
119
|
+
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
120
|
+
|
121
|
+
You should have received a copy of the GNU General Public License along with this gem. If not, see
|
122
|
+
[www.gnu.org/licenses](https://www.gnu.org/licenses/).
|
123
|
+
|
124
|
+
## Contributing
|
125
|
+
|
126
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/planio-gmbh/plaintext.
|
127
|
+
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: loads the Bundler environment and the plaintext
# library, then drops into an interactive IRB session.
%w[bundler/setup plaintext].each { |library| require library }

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier. A different console can be used as well.

# (When using Pry, remember to add pry to the Gemfile first.)
# require "pry"
# Pry.start

require "irb"
IRB.start
|
data/bin/setup
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Helpers for normalising the character encoding of extracted text.
  module CodesetUtil
    # Converts +str+ into a valid UTF-8 string.
    #
    # str      - String whose bytes should be interpreted (may be frozen),
    #            or nil.
    # encoding - Name of the encoding the bytes of +str+ are in. nil or an
    #            empty value means the bytes are treated as UTF-8.
    #
    # Returns nil when +str+ is nil; otherwise a new UTF-8 String in which
    # invalid or unconvertible byte sequences are replaced by '?'.
    # The argument itself is never modified.
    def self.to_utf8(str, encoding)
      return nil if str.nil?

      # Work on a binary copy: force_encoding on the argument itself would
      # change the caller's string and raise FrozenError for frozen input.
      bytes = str.b
      return bytes.force_encoding('UTF-8') if bytes.empty?

      source_encoding = (encoding.nil? || encoding.size == 0) ? 'UTF-8' : encoding

      if source_encoding.upcase == 'UTF-8'
        # scrub replaces only the broken sequences, so valid multi-byte
        # characters survive (a round trip through US-ASCII would turn
        # every non-ASCII character into '?').
        bytes.force_encoding('UTF-8').scrub('?')
      else
        bytes.force_encoding(source_encoding)
             .encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Module-level store for the gem's settings. Values are looked up with
  # Plaintext::Configuration['key'] and populated through .load.
  module Configuration
    class << self
      # The settings currently in effect (nil until .load has run once).
      attr_accessor :config

      # Looks up a single setting, initialising an empty configuration on
      # first use if nothing has been loaded yet.
      def [](name)
        load if config.nil?
        config[name]
      end

      # Resets the configuration and, when +config_file+ is given, replaces it
      # with the result of parsing that text.
      #
      # config_file - String holding the configuration *contents*; it is
      #               rendered through ERB and then parsed as YAML.
      #
      # Anything that does not parse to a Hash is ignored with a warning,
      # leaving the configuration empty.
      def load(config_file = nil)
        self.config = {}
        return unless config_file

        # NOTE(review): YAML.load is used on the rendered text; only feed it
        # trusted configuration content.
        parsed = YAML.load(ERB.new(config_file).result)
        unless parsed.is_a?(Hash)
          warn "`config_file` is not a valid Plaintext configuration file, ignoring."
          return
        end

        self.config = parsed
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Handler for legacy Microsoft Word documents. Text extraction is delegated
  # to an external command (catdoc unless configured otherwise).
  class DocHandler < ExternalCommandHandler
    # Content types this handler registers for.
    CONTENT_TYPES = [
      'application/vnd.ms-word',
      'application/msword'
    ]

    # Command line used when the configuration has no 'catdoc' entry.
    # Frozen like the DEFAULT constants of the sibling handlers so the shared
    # array cannot be modified by accident.
    DEFAULT = [
      '/usr/bin/catdoc', '-dutf-8', '__FILE__'
    ].freeze

    def initialize
      @content_types = CONTENT_TYPES
      @command = Plaintext::Configuration['catdoc'] || DEFAULT
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Handler for image files. Text extraction is delegated to an external
  # command (tesseract unless configured otherwise).
  class ImageHandler < ExternalCommandHandler
    # Content types this handler registers for.
    CONTENT_TYPES = ['image/jpeg', 'image/png', 'image/tiff']

    # Command line used when the configuration has no 'tesseract' entry.
    DEFAULT = [
      '/usr/bin/tesseract',
      '__FILE__',
      'stdout'
    ].freeze

    def initialize
      @content_types = CONTENT_TYPES
      configured = Plaintext::Configuration['tesseract']
      @command = configured || DEFAULT
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Handler for PDF files. Text extraction is delegated to an external
  # command (pdftotext unless configured otherwise).
  class PdfHandler < ExternalCommandHandler
    # Command line used when the configuration has no 'pdftotext' entry.
    DEFAULT = [
      '/usr/bin/pdftotext',
      '-enc',
      'UTF-8',
      '__FILE__',
      '-'
    ].freeze

    def initialize
      @content_type = 'application/pdf'
      configured = Plaintext::Configuration['pdftotext']
      @command = configured || DEFAULT
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Handler for legacy Microsoft PowerPoint files. Text extraction is
  # delegated to an external command (catppt unless configured otherwise).
  class PptHandler < ExternalCommandHandler
    # Content types this handler registers for.
    CONTENT_TYPES = [
      'application/vnd.ms-powerpoint',
      'application/powerpoint'
    ]

    # Command line used when the configuration has no 'catppt' entry.
    # Frozen like the DEFAULT constants of the sibling handlers so the shared
    # array cannot be modified by accident.
    DEFAULT = [
      '/usr/bin/catppt', '-dutf-8', '__FILE__'
    ].freeze

    def initialize
      @content_types = CONTENT_TYPES
      @command = Plaintext::Configuration['catppt'] || DEFAULT
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Handler for RTF files. Text extraction is delegated to an external
  # command (unrtf unless configured otherwise).
  class RtfHandler < ExternalCommandHandler
    # Command line used when the configuration has no 'unrtf' entry.
    DEFAULT = [
      '/usr/bin/unrtf',
      '--text',
      '__FILE__'
    ].freeze

    def initialize
      @content_type = 'application/rtf'
      configured = Plaintext::Configuration['unrtf']
      @command = configured || DEFAULT
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Extracts text from legacy Excel files by running an external command and
  # flattening its comma separated output.
  #
  # The command is looked up under the 'xls2csv' configuration key; DEFAULT is
  # used when that lookup yields nil or false.
  class XlsHandler < ExternalCommandHandler
    # Content types this handler registers itself for.
    CONTENT_TYPES = [
      'application/vnd.ms-excel',
      'application/excel'
    ]

    # Fallback command line. '__FILE__' is the placeholder that
    # ExternalCommandHandler#text swaps for the actual file path.
    # Frozen like the DEFAULT of the image, PDF and RTF handlers: the array is
    # shared by every instance, and #text only ever works on a dup of it.
    DEFAULT = [
      '/usr/bin/xls2csv', '-dutf-8', '__FILE__'
    ].freeze

    def initialize
      @content_types = CONTENT_TYPES
      @command = Plaintext::Configuration['xls2csv'] || DEFAULT
    end

    # Runs the parent extraction with the same arguments, then removes every
    # double quote and replaces each run of commas with a single space.
    #
    # Returns the cleaned string, or nil when the parent returns nil/false.
    def text(*_)
      csv = super
      # Explicit parentheses avoid Ruby's "ambiguous first argument" warning
      # for a regexp literal that follows a method name.
      csv.delete('"').gsub(/,+/, ' ') if csv
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pathname'
|
4
|
+
|
5
|
+
module Plaintext
  # Base class for handlers that obtain text by running an external program.
  #
  # Subclasses are expected to set @command to an array holding the
  # executable path followed by its arguments, one of which is
  # FILE_PLACEHOLDER.
  class ExternalCommandHandler < FileHandler
    require 'fileutils'

    # Argument that #text replaces with the path of the file to process.
    FILE_PLACEHOLDER = '__FILE__'.freeze

    # TODO: Extract this to a proper module
    # Executes the given command through IO.popen and yields an IO object
    # representing STDIN / STDOUT
    #
    # Due to how popen works the command will be executed directly without
    # involving the shell if cmd is an array.
    #
    # options[:write_stdin] - when truthy the write end stays open so the
    #                         block can feed the command's STDIN.
    #
    # Returns the value of the block, or nil when no block is given.
    def shellout(cmd, options = {}, &block)
      IO.popen(cmd, 'r+') do |io|
        io.set_encoding('ASCII-8BIT') if io.respond_to?(:set_encoding)
        io.close_write unless options[:write_stdin]
        yield io if block_given?
      end
    end

    # Runs the configured command against +file+ and returns everything the
    # command wrote to STDOUT as a string.
    #
    # Every FILE_PLACEHOLDER argument is replaced by the file path.
    #
    # Raises ArgumentError when the command contains no placeholder at all.
    # Previously this surfaced as an opaque TypeError from `cmd[nil] = ...`.
    def text(file)
      path = Pathname(file).to_s
      args = Array(@command)

      unless args.include?(FILE_PLACEHOLDER)
        raise ArgumentError,
              "command #{args.inspect} does not contain the #{FILE_PLACEHOLDER} placeholder"
      end

      cmd = args.map { |arg| arg == FILE_PLACEHOLDER ? path : arg }
      shellout(cmd) { |io| io.read }.to_s
    end

    # Accepts a content type only if the parent class accepts it and the
    # external program can actually be executed.
    def accept?(content_type)
      super && available?
    end

    # True when a command is configured and its first element is an
    # executable file.
    def available?
      @command.present? && File.executable?(@command[0])
    end

    # Class level shortcut: builds an instance and asks it.
    def self.available?
      new.available?
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Handler for files that already are plain text (CSV and text/plain).
  class PlaintextHandler < FileHandler
    # Content types this handler registers itself for.
    CONTENT_TYPES = %w(text/csv text/plain)

    def initialize
      @content_types = CONTENT_TYPES
    end

    # Reads the whole file and passes the content, together with 'UTF-8', to
    # Plaintext::CodesetUtil.to_utf8, returning whatever that call returns.
    #
    # File.read is used instead of IO.read on purpose: IO.read treats a name
    # starting with "|" as a command to run, File.read always treats it as a
    # file path.
    def text(file)
      Plaintext::CodesetUtil.to_utf8 File.read(file), 'UTF-8'
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Handler for Word (.docx) documents.
  #
  # Only configures which archive member is read and which XML namespace the
  # text elements live in; the extraction itself is inherited.
  class DocxHandler < OfficeDocumentHandler
    def initialize
      super # parent initializer first, so the values below take precedence
      @namespace_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
      @file_name = 'word/document.xml'
      @content_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Handler for PowerPoint (.pptx and related) documents.
  #
  # Unlike the other office handlers the text is spread over one XML file per
  # slide, so #text is overridden to collect all of them.
  class PptxHandler < OfficeDocumentHandler
    # Content types this handler registers itself for.
    CONTENT_TYPES = [
      'application/vnd.openxmlformats-officedocument.presentationml.presentation',
      'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
      'application/vnd.ms-powerpoint.template.macroEnabled.12'
    ]

    def initialize
      super
      @content_types = CONTENT_TYPES
      @namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main'
    end

    # Returns the text of all slides, in slide order, joined by single spaces.
    #
    # file - path of the zip archive, handed to Zip::File.open.
    def text(file)
      slides = []
      Zip::File.open(file) do |zip_file|
        zip_file.each do |entry|
          # Anchored at the end of the name so that relationship parts such as
          # "slide1.xml.rels" are not parsed as slides.
          number = entry.name[/slide(\d+)\.xml\z/, 1]
          next unless number

          # The block form closes the entry's input stream once it was read.
          slide_text = entry.get_input_stream { |io| xml_to_text(io) }
          # Store the number as an Integer: compared as strings, "10" would
          # sort before "2".
          slides << [number.to_i, slide_text]
        end
      end
      slides.sort_by(&:first).map(&:last).join(' ')
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Handler for Excel (.xlsx) documents.
  #
  # Only configures which archive member is read and which XML namespace the
  # text elements live in; the extraction itself is inherited.
  class XlsxHandler < OfficeDocumentHandler
    def initialize
      super # parent initializer first, so the values below take precedence
      @namespace_uri = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
      @file_name = 'xl/sharedStrings.xml'
      @content_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Extractor for Open / Libre Office formats
  #
  # Reads 'content.xml' from the archive and collects the character data of
  # every 'p' element in the OpenDocument text namespace.
  class OpendocumentHandler < ZippedXmlHandler
    # Content types this handler registers itself for.
    CONTENT_TYPES = [
      'application/vnd.oasis.opendocument.presentation',
      'application/vnd.oasis.opendocument.presentation-template',
      'application/vnd.oasis.opendocument.text',
      'application/vnd.oasis.opendocument.text-template',
      'application/vnd.oasis.opendocument.spreadsheet',
      'application/vnd.oasis.opendocument.spreadsheet-template'
    ]

    def initialize
      super
      # What to accept ...
      @content_types = CONTENT_TYPES
      # ... and where the text is found inside the archive.
      @file_name = 'content.xml'
      @namespace_uri = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
      @element = 'p'
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Handler base class for XML based (MS / Open / Libre) office documents.
  #
  # Subclasses configure three instance variables:
  #   @file_name     - name of the archive member holding the text
  #   @element       - local name of the XML element wrapping text
  #   @namespace_uri - namespace URI that element must belong to
  class ZippedXmlHandler < FileHandler
    require 'zip'
    require 'nokogiri'

    # SAX callbacks that accumulate the character data found inside the
    # configured element; everything outside of it is ignored.
    class SaxDocument < Nokogiri::XML::SAX::Document
      # The text collected so far.
      attr_reader :text

      def initialize(text_element, text_namespace)
        @element = text_element
        @namespace_uri = text_namespace
        @text = ''.dup # unfrozen copy, the string is appended to below
        @is_text = false
      end

      # Handle each element, expecting the name and any attributes
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        @is_text = true if name == @element && uri == @namespace_uri
      end

      # Any characters between the start and end element expected as a string
      def characters(string)
        @text << string if @is_text
      end

      # Given the name of an element once its closing tag is reached
      def end_element_namespace(name, prefix = nil, uri = nil)
        return unless name == @element && uri == @namespace_uri

        # Separate the content of consecutive text elements.
        @text << ' '
        @is_text = false
      end
    end

    # Extracts the text of the archive member named @file_name.
    #
    # file - path of the zip archive, handed to Zip::File.open.
    #
    # Returns the extracted string, or nil when the archive has no such
    # member. Previously the result of the iteration leaked out in that case,
    # which is neither nil nor a string.
    def text(file)
      Zip::File.open(file) do |zip_file|
        zip_file.each do |entry|
          next unless entry.name == @file_name

          # The block form closes the entry's input stream, also when
          # returning from inside the block.
          entry.get_input_stream { |io| return xml_to_text(io) }
        end
      end
      nil
    end

    private

    # Parses the XML read from +io+ and returns the collected text.
    def xml_to_text(io)
      sax_doc = SaxDocument.new @element, @namespace_uri
      Nokogiri::XML::SAX::Parser.new(sax_doc).parse(io)
      sax_doc.text
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Common base of all handlers: decides whether a handler is responsible for
  # a given content type.
  #
  # Subclasses set either @content_type (a single value) or @content_types
  # (a collection responding to include?).
  class FileHandler
    # Returns true when +content_type+ matches this handler.
    #
    # A set @content_type takes precedence and is compared for equality;
    # otherwise membership in @content_types is tested. With neither set the
    # answer is always false.
    def accept?(content_type)
      return content_type == @content_type if @content_type
      return @content_types.include?(content_type) if @content_types

      false
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Plaintext
  # Picks the handler responsible for a file's content type and returns the
  # normalized text that handler extracts.
  class Resolver
    MAX_FULLTEXT_LENGTH = 4_194_304 # 4 megabytes

    class << self
      # Handler instances built by .file_handlers, kept for reuse.
      attr_accessor :cached_file_handlers

      # Handler classes in lookup order; the first accepting handler wins.
      HANDLERS = [
        Plaintext::PdfHandler,
        Plaintext::OpendocumentHandler,
        Plaintext::DocxHandler,
        Plaintext::XlsxHandler,
        Plaintext::PptxHandler,
        Plaintext::DocHandler,
        Plaintext::XlsHandler,
        Plaintext::PptHandler,
        Plaintext::ImageHandler,
        Plaintext::RtfHandler,
        Plaintext::PlaintextHandler
      ].freeze

      # Returns one instance per handler class, creating and caching them on
      # first use.
      def file_handlers
        cached = cached_file_handlers
        return cached if cached.present?

        self.cached_file_handlers = HANDLERS.map(&:new)
      end
    end

    def initialize(file, content_type = nil)
      @file = file
      @content_type = content_type
    end

    # Returns the extracted fulltext or nil if no matching handler was found
    # for the file type.
    #
    # Whitespace runs are collapsed to single spaces, the result is stripped
    # and then limited to MAX_FULLTEXT_LENGTH.
    def text
      handler = find_handler
      return unless handler

      extracted = handler.text(@file)
      return unless extracted

      extracted.gsub!(/\s+/m, ' ')
      extracted.strip!
      extracted.mb_chars.compose.limit(MAX_FULLTEXT_LENGTH).to_s
    end

    private

    # First cached handler that accepts the content type, or nil.
    def find_handler
      self.class.file_handlers.detect { |candidate| candidate.accept? @content_type }
    end
  end
end
|
data/lib/plaintext.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'active_support/core_ext/string'
|
4
|
+
|
5
|
+
require 'plaintext/version'
|
6
|
+
require 'plaintext/configuration'
|
7
|
+
|
8
|
+
require 'plaintext/codeset_util'
|
9
|
+
|
10
|
+
require 'plaintext/file_handler'
|
11
|
+
require 'plaintext/file_handler/external_command_handler'
|
12
|
+
require 'plaintext/file_handler/external_command_handler/doc_handler'
|
13
|
+
require 'plaintext/file_handler/external_command_handler/image_handler'
|
14
|
+
require 'plaintext/file_handler/external_command_handler/pdf_handler'
|
15
|
+
require 'plaintext/file_handler/external_command_handler/ppt_handler'
|
16
|
+
require 'plaintext/file_handler/external_command_handler/rtf_handler'
|
17
|
+
require 'plaintext/file_handler/external_command_handler/xls_handler'
|
18
|
+
|
19
|
+
require 'plaintext/file_handler/zipped_xml_handler'
|
20
|
+
require 'plaintext/file_handler/zipped_xml_handler/office_document_handler'
|
21
|
+
require 'plaintext/file_handler/zipped_xml_handler/office_document_handler/docx_handler'
|
22
|
+
require 'plaintext/file_handler/zipped_xml_handler/office_document_handler/pptx_handler'
|
23
|
+
require 'plaintext/file_handler/zipped_xml_handler/office_document_handler/xlsx_handler'
|
24
|
+
require 'plaintext/file_handler/zipped_xml_handler/opendocument_handler'
|
25
|
+
|
26
|
+
require 'plaintext/file_handler/plaintext_handler'
|
27
|
+
|
28
|
+
require 'plaintext/resolver'
|
data/plaintext.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'plaintext/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
  # Identity
  spec.name    = 'plaintext'
  spec.version = Plaintext::VERSION
  spec.authors = ['Jens Krämer', 'Planio GmbH', 'OpenProject GmbH']
  spec.email   = ['info@openproject.com']

  # Description
  spec.summary     = 'Extract plain text from most common office documents.'
  spec.description = "Extract text from common office files. Based on the file's content type a command line tool is selected to do the job."
  spec.homepage    = 'https://github.com/planio-gmbh/plaintext'
  spec.license     = 'GPL-2.0'

  # Packaged files: everything tracked by git except test directories.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files         = tracked_files.reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'rubyzip', '~> 1.2.1'
  spec.add_dependency 'nokogiri', '~> 1.8.1'
  spec.add_dependency 'activesupport', '>2.2.1 '

  # Development dependencies
  spec.add_development_dependency 'bundler', '~> 1.10'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec'
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# This is an example configuration file. Copy this file to your application config folder. In a Rails application that
|
2
|
+
# would be `<Rails.root>/config/plaintext.yml`
|
3
|
+
#
|
4
|
+
# Text extraction helper programs.
|
5
|
+
#
|
6
|
+
# Commands should write the resulting plain text to STDOUT. Use __FILE__ as
|
7
|
+
# placeholder for the file path. The values below are the defaults.
|
8
|
+
|
9
|
+
# apt-get install poppler-utils
|
10
|
+
# pdftotext:
|
11
|
+
# - /usr/bin/pdftotext
|
12
|
+
# - -enc
|
13
|
+
# - UTF-8
|
14
|
+
# - __FILE__
|
15
|
+
# - '-'
|
16
|
+
|
17
|
+
# apt-get install unrtf
|
18
|
+
# unrtf:
|
19
|
+
# - /usr/bin/unrtf
|
20
|
+
# - --text
|
21
|
+
# - __FILE__
|
22
|
+
|
23
|
+
# apt-get install catdoc
|
24
|
+
# catdoc:
|
25
|
+
# - /usr/bin/catdoc
|
26
|
+
# - -dutf-8
|
27
|
+
# - __FILE__
|
28
|
+
# xls2csv:
|
29
|
+
# - /usr/bin/xls2csv
|
30
|
+
# - -dutf-8
|
31
|
+
# - __FILE__
|
32
|
+
# catppt:
|
33
|
+
# - /usr/bin/catppt
|
34
|
+
# - -dutf-8
|
35
|
+
# - __FILE__
|
36
|
+
|
37
|
+
# apt-get install tesseract-ocr
|
38
|
+
# tesseract:
|
39
|
+
# - /usr/bin/tesseract
|
40
|
+
# - __FILE__
|
41
|
+
# - stdout
|
metadata
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: plaintext
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jens Krämer
|
8
|
+
- Planio GmbH
|
9
|
+
- OpenProject GmbH
|
10
|
+
autorequire:
|
11
|
+
bindir: exe
|
12
|
+
cert_chain: []
|
13
|
+
date: 2018-02-15 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rubyzip
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
|
+
requirements:
|
19
|
+
- - "~>"
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.2.1
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
requirements:
|
26
|
+
- - "~>"
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
version: 1.2.1
|
29
|
+
- !ruby/object:Gem::Dependency
|
30
|
+
name: nokogiri
|
31
|
+
requirement: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - "~>"
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 1.8.1
|
36
|
+
type: :runtime
|
37
|
+
prerelease: false
|
38
|
+
version_requirements: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - "~>"
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 1.8.1
|
43
|
+
- !ruby/object:Gem::Dependency
|
44
|
+
name: activesupport
|
45
|
+
requirement: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 2.2.1
|
50
|
+
type: :runtime
|
51
|
+
prerelease: false
|
52
|
+
version_requirements: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">"
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: 2.2.1
|
57
|
+
- !ruby/object:Gem::Dependency
|
58
|
+
name: bundler
|
59
|
+
requirement: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - "~>"
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '1.10'
|
64
|
+
type: :development
|
65
|
+
prerelease: false
|
66
|
+
version_requirements: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - "~>"
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '1.10'
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: rake
|
73
|
+
requirement: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - "~>"
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '10.0'
|
78
|
+
type: :development
|
79
|
+
prerelease: false
|
80
|
+
version_requirements: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - "~>"
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '10.0'
|
85
|
+
- !ruby/object:Gem::Dependency
|
86
|
+
name: rspec
|
87
|
+
requirement: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: '0'
|
92
|
+
type: :development
|
93
|
+
prerelease: false
|
94
|
+
version_requirements: !ruby/object:Gem::Requirement
|
95
|
+
requirements:
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
99
|
+
description: Extract text from common office files. Based on the file's content type
|
100
|
+
a command line tool is selected to do the job.
|
101
|
+
email:
|
102
|
+
- info@openproject.com
|
103
|
+
executables: []
|
104
|
+
extensions: []
|
105
|
+
extra_rdoc_files: []
|
106
|
+
files:
|
107
|
+
- ".gitignore"
|
108
|
+
- ".rspec"
|
109
|
+
- ".travis.yml"
|
110
|
+
- Gemfile
|
111
|
+
- LICENSE
|
112
|
+
- README.md
|
113
|
+
- Rakefile
|
114
|
+
- bin/console
|
115
|
+
- bin/setup
|
116
|
+
- lib/plaintext.rb
|
117
|
+
- lib/plaintext/codeset_util.rb
|
118
|
+
- lib/plaintext/configuration.rb
|
119
|
+
- lib/plaintext/file_handler.rb
|
120
|
+
- lib/plaintext/file_handler/external_command_handler.rb
|
121
|
+
- lib/plaintext/file_handler/external_command_handler/doc_handler.rb
|
122
|
+
- lib/plaintext/file_handler/external_command_handler/image_handler.rb
|
123
|
+
- lib/plaintext/file_handler/external_command_handler/pdf_handler.rb
|
124
|
+
- lib/plaintext/file_handler/external_command_handler/ppt_handler.rb
|
125
|
+
- lib/plaintext/file_handler/external_command_handler/rtf_handler.rb
|
126
|
+
- lib/plaintext/file_handler/external_command_handler/xls_handler.rb
|
127
|
+
- lib/plaintext/file_handler/plaintext_handler.rb
|
128
|
+
- lib/plaintext/file_handler/zipped_xml_handler.rb
|
129
|
+
- lib/plaintext/file_handler/zipped_xml_handler/office_document_handler.rb
|
130
|
+
- lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/docx_handler.rb
|
131
|
+
- lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/pptx_handler.rb
|
132
|
+
- lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/xlsx_handler.rb
|
133
|
+
- lib/plaintext/file_handler/zipped_xml_handler/opendocument_handler.rb
|
134
|
+
- lib/plaintext/resolver.rb
|
135
|
+
- lib/plaintext/version.rb
|
136
|
+
- plaintext.gemspec
|
137
|
+
- plaintext.yml.example
|
138
|
+
homepage: https://github.com/planio-gmbh/plaintext
|
139
|
+
licenses:
|
140
|
+
- GPL-2.0
|
141
|
+
metadata: {}
|
142
|
+
post_install_message:
|
143
|
+
rdoc_options: []
|
144
|
+
require_paths:
|
145
|
+
- lib
|
146
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
147
|
+
requirements:
|
148
|
+
- - ">="
|
149
|
+
- !ruby/object:Gem::Version
|
150
|
+
version: '0'
|
151
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
152
|
+
requirements:
|
153
|
+
- - ">="
|
154
|
+
- !ruby/object:Gem::Version
|
155
|
+
version: '0'
|
156
|
+
requirements: []
|
157
|
+
rubyforge_project:
|
158
|
+
rubygems_version: 2.6.13
|
159
|
+
signing_key:
|
160
|
+
specification_version: 4
|
161
|
+
summary: Extract plain text from most common office documents.
|
162
|
+
test_files: []
|