plain_text 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +20 -0
- data/LICENSE.txt +20 -0
- data/README.en.rdoc +1 -0
- data/lib/plain_text/parse_rule.rb +15 -1
- data/lib/plain_text.rb +103 -40
- data/plain_text.gemspec +6 -4
- data/test/test_plain_text.rb +25 -15
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c015ed947812371558456375c2933f0d03720082899f8c58c699419eda77f1b
|
4
|
+
data.tar.gz: fafc479d9bb492bd3b3ad140ec7a58d2cc0e7bc49dec4ad80c4111ad1f63e3df
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cb7d054e24cc85c64bbb556d4de30b3b54c9b51b409519d9b7f307fbe64dc05dc32e6e7cbeccc027b41c842a31ec5b489e60801b1c1c1f72e587157f62f38391
|
7
|
+
data.tar.gz: aef2b0ebd0c69f694c438cbf8d8e62d6d754d92c5d804553649c681d6c088bd9bb363197d9fb209b184aa49fb44ef5e733268e1d53a19bc7dfef260c86dee88c
|
data/ChangeLog
CHANGED
@@ -1,3 +1,23 @@
|
|
1
|
+
-----
|
2
|
+
(Version: 0.2)
|
3
|
+
2019-10-27 Masa Sakano
|
4
|
+
* PlainText.clean_text
|
5
|
+
* Option name and default changed from `firstsps_style=:truncate` to `firstlbs_style=:delete`
|
6
|
+
* Default of Option `linehead_style` changed from :delete to :none
|
7
|
+
* Option `sps_style` now ignores the line head and tail in a new private class method `clean_text_sps!`
|
8
|
+
* Fixed bugs, including the one for Option choice `linetail_style: :markdown`
|
9
|
+
* New constant ParseRule::RuleEachLineStrip
|
10
|
+
|
11
|
+
-----
|
12
|
+
(Version: 0.2)
|
13
|
+
2019-10-27 Masa Sakano
|
14
|
+
* PlainText.clean_text
|
15
|
+
* Option name and default changed from `firstsps_style=:truncate` to `firstlbs_style=:delete`
|
16
|
+
* Default of Option `linehead_style` changed from :delete to :none
|
17
|
+
* Option `sps_style` now ignores the line head and tail in a new private class method `clean_text_sps!`
|
18
|
+
* Fixed bugs, including the one for Option choice `linetail_style: :markdown`
|
19
|
+
* New constant ParseRule::RuleEachLineStrip
|
20
|
+
|
1
21
|
-----
|
2
22
|
(Version: 0.1)
|
3
23
|
2019-10-25 Masa Sakano
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012-2018 Scott Chacon and others
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.en.rdoc
CHANGED
@@ -81,6 +81,10 @@ module PlainText
|
|
81
81
|
# pt1.parts[0].parts[1] # => Paragraph::Title("Breaking!")
|
82
82
|
# pt1.boundaries[1] # => Boundary("\n======\n")
|
83
83
|
#
|
84
|
+
# @todo
|
85
|
+
# It would be smarter if each instance (Regexp and Part) had its own "name"
|
86
|
+
# rather than this class holding @names as an Array.
|
87
|
+
#
|
84
88
|
# @author Masa Sakano (Wise Babel Ltd)
|
85
89
|
#
|
86
90
|
class ParseRule
|
@@ -462,13 +466,23 @@ module PlainText
|
|
462
466
|
|
463
467
|
def_lb_q = PlainText::DefLineBreaks.map{|i| Regexp.quote i}.join '|'
|
464
468
|
|
465
|
-
# {ParseRule} instance to
|
469
|
+
# {ParseRule} instance to
|
466
470
|
# split a String with 2 or more linebreaks (with potentially white-spaces in between).
|
467
471
|
# This instance can be dup-ped and used normally. However, if it is clone-d, the cloned instance would be unmodifiable.
|
468
472
|
RuleConsecutiveLbs = self.new(/((?:#{def_lb_q})(?:#{def_lb_q}|[[:blank:]])*(?:#{def_lb_q}))/, name: 'ConsecutiveLbs') # => /((?:\r\n|\n|\r){2,}/
|
469
473
|
RuleConsecutiveLbs.freeze
|
470
474
|
RuleConsecutiveLbs.rules.freeze
|
471
475
|
RuleConsecutiveLbs.names.freeze
|
476
|
+
|
477
|
+
# {ParseRule} instance to
|
478
|
+
# split a String with 1 linebreak that is potentially sandwiched with white-spaces
|
479
|
+
# (or a whitespace(s) at the very beginning or end).
|
480
|
+
# Essentially, each line (after Ruby-strip-ped) is treated as Paragraph.
|
481
|
+
# This instance can be dup-ped and used normally. However, if it is clone-d, the cloned instance would be unmodifiable.
|
482
|
+
RuleEachLineStrip = self.new(/(\A[[:space:]]+|[[:space:]]*\n[[:space:]]*|[[:space:]]+\z)/, name: 'EachLineStrip') # => matches leading white-spaces, a linebreak with surrounding white-spaces, or trailing white-spaces
|
483
|
+
RuleEachLineStrip.freeze
|
484
|
+
RuleEachLineStrip.rules.freeze
|
485
|
+
RuleEachLineStrip.names.freeze
|
472
486
|
end # class ParseRule
|
473
487
|
end # module PlainText
|
474
488
|
|
data/lib/plain_text.rb
CHANGED
@@ -55,6 +55,7 @@ module PlainText
|
|
55
55
|
# @return [Integer]
|
56
56
|
def self.count_char(instr, *rest,
|
57
57
|
lbs_style: :delete,
|
58
|
+
linehead_style: :delete,
|
58
59
|
lastsps_style: :delete,
|
59
60
|
lb_out: "\n",
|
60
61
|
**k)
|
@@ -72,9 +73,9 @@ module PlainText
|
|
72
73
|
# * Blank lines are truncated into one line with no white spaces: +boundary_style=lb_out*2(=$/*2)+
|
73
74
|
# * Consecutive white spaces are truncated into a single space: +sps_style=:truncate+
|
74
75
|
# * White spaces before or after a CJK character is deleted: +delete_asian_space=true+
|
75
|
-
# * Preceding white spaces in each line are
|
76
|
+
# * Preceding white spaces in each line are preserved: +linehead_style=:none+
|
76
77
|
# * Trailing white spaces in each line are deleted: +linetail_style=:delete+
|
77
|
-
# *
|
78
|
+
# * Line-breaks at the beginning of the entire input string are deleted: +firstlbs_style=:delete+
|
78
79
|
# * Trailing white spaces and line-breaks at the end of the entire input string are truncated into a single linebreak: +lastsps_style=:truncate+
|
79
80
|
#
|
80
81
|
# For a String with predominantly CJK characters, the following setting is recommended:
|
@@ -85,19 +86,25 @@ module PlainText
|
|
85
86
|
# Note for the Symbols in optional arguments, the Symbol with the first character only is accepted,
|
86
87
|
# e.g., +:d+ instead of +:delete+ (nb., +:t2+ for +:truncate2+).
|
87
88
|
#
|
88
|
-
# For more detail, see the description.
|
89
|
+
# For more detail, see the description of each of the command-line options.
|
90
|
+
#
|
91
|
+
# Note that for the case of traditional genko-yoshi-style Japanese texts
|
92
|
+
# with "jisage" marking the start of each new paragraph, probably
|
93
|
+
# the best way is to make your own Part instance to give to this method,
|
94
|
+
# where the rule for the Part should be something like:
|
95
|
+
# /(\A[[:blank:]]+|\n[[:space:]]+)/
|
89
96
|
#
|
90
97
|
# @param prt [PlainText:Part, String] {Part} or String to examine.
|
91
98
|
# @param preserve_paragraph: [Boolean] Paragraphs are taken into account if true (Def: False). In the input, paragraphs are defined to be separated with more than one +lb+ with potentially some space characters in between. Their output style is specified with +boundary_style+.
|
92
99
|
# @param boundary_style: [String, Symbol] One of +(:truncate|:truncate2|:delete|:none)+ or String. If String, the boundaries between paragraphs are replaced with this String (Def: +lb_out*2+). If +:truncate+, consecutive linebreaks and spaces are truncated into 2 linebreaks. +:truncate2+ are similar, but they are not truncated beyond 3 linebreaks (ie., up to 2 blank lines between Paragraphs). If +:none+, nothing is done about them. Unless :none, all the white spaces between linebreaks are deleted.
|
93
100
|
# @param lbs_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: +:truncate+). If :delete, all the linebreaks within paragraphs are deleted. +:truncate+ is meaningful only when +preserve_paragraph=false+ and consecutive linebreaks are truncated into 1 linebreak.
|
94
|
-
# @param sps_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: +:truncate+). If +:truncate+, the consecutive white spaces within paragraphs are truncated into a single white space. If :delete, they are deleted.
|
101
|
+
# @param sps_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: +:truncate+). If +:truncate+, the consecutive white spaces within paragraphs, *except* for those at the line-head or line-tail (which are controlled by +linehead_style+ and +linetail_style+, respectively), are truncated into a single white space. If :delete, they are deleted.
|
95
102
|
# @param lb_is_space: [Boolean] If true, a line-break, except those for the boundaries (unless +preserve_paragraph+ is false), is equivalent to a space (Def: False).
|
96
103
|
# @param delete_asian_space: [Boolean] Any spaces between, before, after Asian characters (but punctuation) are deleted, if true (Default).
|
97
|
-
# @param linehead_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: :
|
98
|
-
# @param linetail_style: [Symbol] One of +(:truncate|:delete|:markdown|:none)+ (Def: :delete). Determine how to handle consecutive white spaces at the end of each line. If +:markdown
|
99
|
-
# @param
|
100
|
-
# @param lastsps_style: [Symbol, String] One of +(:truncate|:delete|:none|:linebreak)+ or String (Def: :truncate). If +:truncate+, any of white spaces
|
104
|
+
# @param linehead_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: :none). Determine how to handle consecutive white spaces at the beginning of each line.
|
105
|
+
# @param linetail_style: [Symbol] One of +(:truncate|:delete|:markdown|:none)+ (Def: :delete). Determine how to handle consecutive white spaces at the end of each line. If +:markdown+, 1 space is always deleted, and two or more spaces are truncated into two ASCII whitespaces *if* the last two spaces are ASCII whitespaces, or else untouched.
|
106
|
+
# @param firstlbs_style: [Symbol, String] One of +(:truncate|:delete|:none)+ or String (Def: :delete). If +:truncate+, any linebreaks at the very beginning of self (and whitespaces in between), if they exist, are truncated to a single linebreak. If String, they are, even if none exist, replaced with the specified String (such as a linebreak). If +:delete+, they are deleted. Note this option has nothing to do with the whitespaces at the beginning of the first significant line (hence the name of the option). Note if a (random) Part is given, this option only considers the first significant element of it.
|
107
|
+
# @param lastsps_style: [Symbol, String] One of +(:truncate|:delete|:none|:linebreak)+ or String (Def: :truncate). If +:truncate+, any linebreaks *AND* white spaces at the tail of self, if they exist, are truncated to a single linebreak. If +:delete+, they are deleted. If String, they are, even if none exist, replaced with the specified String (such as a linebreak, in which case +lb_out+ is used as String, i.e., it guarantees only 1 linebreak to exist at the end of the String). Note if a (random) Part is given, this option only considers the last significant element of it.
|
101
108
|
# @param lb: [String] Linebreak character like +\n+ etc (Default: $/). If this is one of the standard line-breaks, irregular line-breaks (for example, existence of CR when only LF should be there) are corrected.
|
102
109
|
# @param lb_out: [String] Linebreak used for output (Default: +lb+)
|
103
110
|
# @return same as prt
|
@@ -110,16 +117,16 @@ module PlainText
|
|
110
117
|
lb_is_space: false,
|
111
118
|
sps_style: :truncate,
|
112
119
|
delete_asian_space: true,
|
113
|
-
linehead_style: :
|
120
|
+
linehead_style: :none,
|
114
121
|
linetail_style: :delete,
|
115
|
-
|
122
|
+
firstlbs_style: :delete,
|
116
123
|
lastsps_style: :truncate,
|
117
124
|
lb: $/,
|
118
125
|
lb_out: nil, # If unspecified, will be replaced with lb
|
119
126
|
is_debug: false
|
120
127
|
)
|
121
128
|
|
122
|
-
|
129
|
+
isdebug = true if prt == "\n \n abc\n\n \ndef\n\n \n\n"
|
123
130
|
lb_out ||= lb # Output linebreak
|
124
131
|
boundary_style = lb_out*2 if true == boundary_style
|
125
132
|
boundary_style = "" if [:delete, :d].include? boundary_style
|
@@ -128,7 +135,12 @@ module PlainText
|
|
128
135
|
if !prt.class.method_defined? :last_significant_element
|
129
136
|
# Construct a Part instance from the given String.
|
130
137
|
ret = ''
|
131
|
-
|
138
|
+
begin
|
139
|
+
prt = prt.unicode_normalize
|
140
|
+
rescue ArgumentError # (invalid byte sequence in UTF-8)
|
141
|
+
warn "The given String in (#{self.name}\##{__method__}) seems wrong."
|
142
|
+
raise
|
143
|
+
end
|
132
144
|
prt = normalize_lb(prt, "\n", lb_from: (DefLineBreaks.include?(lb) ? nil : lb)).dup
|
133
145
|
kwd = (["\r\n", "\r", "\n"].include?(lb) ? {} : { rules: /#{Regexp.quote lb}{2,}/})
|
134
146
|
prt = (preserve_paragraph ? Part.parse(prt, **kwd) : Part.new([prt]))
|
@@ -148,6 +160,7 @@ module PlainText
|
|
148
160
|
lb_is_space: lb_is_space,
|
149
161
|
sps_style: sps_style,
|
150
162
|
delete_asian_space: delete_asian_space,
|
163
|
+
is_debug: is_debug
|
151
164
|
)
|
152
165
|
# Handles the line head/tails.
|
153
166
|
clean_text_line_head_tail!( prt,
|
@@ -157,8 +170,9 @@ module PlainText
|
|
157
170
|
|
158
171
|
# Handles the file head/tail.
|
159
172
|
clean_text_file_head_tail!( prt,
|
160
|
-
|
173
|
+
firstlbs_style: firstlbs_style,
|
161
174
|
lastsps_style: lastsps_style,
|
175
|
+
is_debug: isdebug
|
162
176
|
)
|
163
177
|
|
164
178
|
# Replaces the linebreaks to the specified one
|
@@ -284,29 +298,29 @@ module PlainText
|
|
284
298
|
)
|
285
299
|
|
286
300
|
# Linebreaks and spaces
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
raise ArgumentError
|
297
|
-
end
|
301
|
+
case lbs_style
|
302
|
+
when :truncate, :t
|
303
|
+
prt.parts.each{|ec| ec.gsub!(/\n{2,}/m, "\n")}
|
304
|
+
when :delete, :d
|
305
|
+
prt.parts.each{|ec| ec.gsub!(/\n/m, "")}
|
306
|
+
when :none, :n
|
307
|
+
# Does nothing
|
308
|
+
else
|
309
|
+
raise ArgumentError
|
298
310
|
end
|
299
311
|
|
312
|
+
# Handles spaces in each line
|
313
|
+
clean_text_sps!(prt, sps_style: sps_style, is_debug: is_debug)
|
314
|
+
|
300
315
|
# Linebreaks become spaces
|
301
316
|
if lb_is_space
|
302
317
|
prt.parts.each{|ec| ec.gsub!(/\n/m, " ")}
|
303
|
-
|
318
|
+
clean_text_sps!(prt, sps_style: sps_style, is_debug: is_debug) if sps_style == :truncate
|
304
319
|
end
|
305
320
|
|
306
321
|
# Ignore spaces between, before, and after Asian characters.
|
307
322
|
if delete_asian_space
|
308
|
-
|
309
|
-
prt.parts.each do |ea_p|
|
323
|
+
prt.parts.each do |ea_p|
|
310
324
|
PlainText.extend_this(ea_p)
|
311
325
|
ea_p.delete_spaces_bw_cjk_european! # Destructive change in prt.
|
312
326
|
end
|
@@ -322,7 +336,7 @@ module PlainText
|
|
322
336
|
# @see Plaintext.clean_text
|
323
337
|
def self.clean_text_line_head_tail!(
|
324
338
|
prt,
|
325
|
-
linehead_style: :
|
339
|
+
linehead_style: :none,
|
326
340
|
linetail_style: :delete,
|
327
341
|
is_debug: false
|
328
342
|
)
|
@@ -348,7 +362,7 @@ module PlainText
|
|
348
362
|
when :markdown, :m
|
349
363
|
# Two spaces are preserved
|
350
364
|
prt.parts.each{|ec| ec.gsub!(/(?:^|(?<![[:blank:]]))[[:blank:]]$/, "")} # A single space is deleted.
|
351
|
-
prt.parts.each{|ec| ec.gsub!(/[[:blank:]]
|
365
|
+
prt.parts.each{|ec| ec.gsub!(/[[:blank:]]+ $/, " ")} # 3 or more spaces are truncated into 2 spaces, only IF the last two spaces are the ASCII spaces.
|
352
366
|
when :none, :n
|
353
367
|
# Do nothing
|
354
368
|
else
|
@@ -358,36 +372,47 @@ module PlainText
|
|
358
372
|
private_class_method :clean_text_line_head_tail!
|
359
373
|
|
360
374
|
# @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
|
361
|
-
# @param
|
375
|
+
# @param firstlbs_style [Symbol, String] (see Plaintext.clean_text#firstlbs_style)
|
362
376
|
# @param lastsps_style [Symbol, String] (see Plaintext.clean_text#lastsps_style)
|
363
377
|
# @return [void]
|
364
378
|
#
|
365
379
|
# @see Plaintext.clean_text
|
366
380
|
def self.clean_text_file_head_tail!(
|
367
381
|
prt,
|
368
|
-
|
382
|
+
firstlbs_style: :delete,
|
369
383
|
lastsps_style: :truncate,
|
370
384
|
is_debug: false
|
371
385
|
)
|
372
386
|
|
373
387
|
# Handles the beginning of the given Part.
|
374
|
-
obj = prt.first_significant_element
|
375
|
-
# The first significant element is either Paragraph or Background.
|
376
|
-
#
|
388
|
+
obj = prt.first_significant_element || return
|
389
|
+
# The first significant element is either Paragraph or Background.
|
390
|
+
# obj may be nil.
|
377
391
|
|
378
|
-
case
|
392
|
+
case firstlbs_style
|
379
393
|
when String
|
380
|
-
|
394
|
+
# This assumes the first Background is not
|
395
|
+
# (1) containing any non-space characters,
|
396
|
+
# (2) white-spaces only AND the first Paragraph starts from a linebreak.
|
397
|
+
# You can assume it as long as String is the original input.
|
398
|
+
# However, if the input is Part, anything can be possible, like
|
399
|
+
# first multiple Backgrounds contain a linebreak for each, each of which
|
400
|
+
# follows an empty Paragraph...
|
401
|
+
# The thing is, if String is always returned, it is much easier
|
402
|
+
# to process after Part#join. However, the method may return Part.
|
403
|
+
# Therefore, you cannot do it!
|
404
|
+
# I explain it in the document in {self.clean_text}.
|
405
|
+
obj.sub!(/\A([[:space:]]*\n)?/m, firstlbs_style)
|
381
406
|
when :truncate, :t
|
382
|
-
# The initial blank lines, if exist, are truncated to a single "
|
383
|
-
obj.sub!(/\A[[:space:]]
|
407
|
+
# The initial blank lines, if exist, are truncated to a single "\n"
|
408
|
+
obj.sub!(/\A[[:space:]]*\n/m, "\n")
|
384
409
|
when :delete, :d
|
385
|
-
# The initial blank lines
|
410
|
+
# The initial blank lines are deleted.
|
386
411
|
obj.sub!(/\A[[:space:]]*\n/m, "")
|
387
412
|
when :none, :n
|
388
413
|
# Do nothing
|
389
414
|
else
|
390
|
-
raise ArgumentError, "Invalid
|
415
|
+
raise ArgumentError, "Invalid firstlbs_style (#{firstlbs_style.inspect}) is specified."
|
391
416
|
end
|
392
417
|
|
393
418
|
# Handles the end of the given Part.
|
@@ -423,6 +448,43 @@ module PlainText
|
|
423
448
|
private_class_method :clean_text_file_head_tail!
|
424
449
|
|
425
450
|
|
451
|
+
# Handles spaces within Paragraphs
|
452
|
+
#
|
453
|
+
# uses Part to transform a Paragraph into a Part
|
454
|
+
#
|
455
|
+
# @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
|
456
|
+
# @param sps_style (see Plaintext.clean_text#sps_style)
|
457
|
+
# @return [void]
|
458
|
+
#
|
459
|
+
# @see Plaintext.clean_text
|
460
|
+
def self.clean_text_sps!(
|
461
|
+
prt,
|
462
|
+
sps_style: :truncate,
|
463
|
+
is_debug: false
|
464
|
+
)
|
465
|
+
|
466
|
+
prt.parts.each do |e_pa|
|
467
|
+
ru = ParseRule
|
468
|
+
# Each line treated as a Paragraph, and [[:space:]]+ between them as a Boundary.
|
469
|
+
# Then, to work on anything within a line except for line-head/tail is easy.
|
470
|
+
prt_para = Part.parse(e_pa, rule: ParseRule::RuleEachLineStrip).map_parts { |e_li|
|
471
|
+
case sps_style
|
472
|
+
when :truncate, :t
|
473
|
+
e_li.gsub(/[[:blank:]]{2,}/m, " ")
|
474
|
+
when :delete, :d
|
475
|
+
e_li.gsub(/[[:blank:]]+/m, "")
|
476
|
+
when :none, :n
|
477
|
+
e_li
|
478
|
+
else
|
479
|
+
raise ArgumentError
|
480
|
+
end
|
481
|
+
} # map_parts
|
482
|
+
e_pa.replace prt_para.join
|
483
|
+
end
|
484
|
+
end
|
485
|
+
private_class_method :clean_text_sps!
|
486
|
+
|
487
|
+
|
426
488
|
####################################################
|
427
489
|
# Instance methods
|
428
490
|
####################################################
|
@@ -438,6 +500,7 @@ module PlainText
|
|
438
500
|
# @return [Integer]
|
439
501
|
def count_char(*rest,
|
440
502
|
lbs_style: :delete,
|
503
|
+
linehead_style: :delete,
|
441
504
|
lastsps_style: :none,
|
442
505
|
lb_out: "\n",
|
443
506
|
**k)
|
data/plain_text.gemspec
CHANGED
@@ -1,17 +1,19 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
3
|
require 'rake'
|
4
|
+
require 'date'
|
4
5
|
|
5
6
|
Gem::Specification.new do |s|
|
6
|
-
s.name = %q{plain_text}
|
7
|
-
s.version = "0.
|
7
|
+
s.name = %q{plain_text}.sub(/.*/){|c| (c == File.basename(Dir.pwd)) ? c : raise("ERROR: s.name=(#{c}) in gemspec seems wrong!")}
|
8
|
+
s.version = "0.2"
|
8
9
|
# s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
9
10
|
%w(countchar).each do |f|
|
10
|
-
s.
|
11
|
+
path = s.bindir+'/'+f
|
12
|
+
File.executable?(path) ? s.executables << f : raise("ERROR: Executable (#{path}) is not executable!")
|
11
13
|
end
|
12
14
|
s.bindir = 'bin'
|
13
15
|
s.authors = ["Masa Sakano"]
|
14
|
-
s.date = %q{2019-10-
|
16
|
+
s.date = %q{2019-10-27}.sub(/.*/){|c| (Date.parse(c) == Date.today) ? c : raise("ERROR: s.date=(#{c}) is not today!")}
|
15
17
|
s.summary = %q{Module to handle Plain-Text}
|
16
18
|
s.description = %q{This module provides utility functions and methods to handle plain text, classes Part/Paragraph/Boundary to represent the logical structure of a document and ParseRule to describe the rules to parse plain text to produce a Part-type Ruby instance.}
|
17
19
|
# s.email = %q{abc@example.com}
|
data/test/test_plain_text.rb
CHANGED
@@ -77,9 +77,9 @@ class TestUnitPlainText < MiniTest::Test
|
|
77
77
|
def test_clean_text03
|
78
78
|
assert_raises(ArgumentError){ PT.clean_text("abc", boundary_style: nil) }
|
79
79
|
s1 = "abc \n \n def\n\n"
|
80
|
-
s20 = "abc\n \
|
81
|
-
s21 = "
|
82
|
-
s22 = "abc\n\
|
80
|
+
s20 = "abc\n \n def\n"
|
81
|
+
s21 = "abcXYZ defXYZ"
|
82
|
+
s22 = "abc\n\n def\n"
|
83
83
|
sr = PT.clean_text(s1, boundary_style: :none)
|
84
84
|
assert_equal s20, sr, prerr(s20, sr)
|
85
85
|
sr = PT.clean_text(s1, boundary_style: "XYZ")
|
@@ -105,36 +105,46 @@ class TestUnitPlainText < MiniTest::Test
|
|
105
105
|
assert_equal s22, sr, prerr(s22, sr)
|
106
106
|
|
107
107
|
s3 = "\nabc\n\ndef"
|
108
|
-
s41 =
|
108
|
+
s41 = "\nabc\n\ndefTT"
|
109
109
|
s42 = "\nabc\n\ndef"
|
110
|
-
sr = PT.clean_text(s3,
|
110
|
+
sr = PT.clean_text(s3, firstlbs_style: :truncate, lastsps_style: 'TT')
|
111
111
|
assert_equal s41, sr, prerr(s41, sr)
|
112
|
-
sr = PT.clean_text(s3,
|
112
|
+
sr = PT.clean_text(s3, firstlbs_style: :none, lastsps_style: :delete)
|
113
113
|
assert_equal s42, sr, prerr(s42, sr)
|
114
114
|
end
|
115
115
|
|
116
116
|
def test_clean_text_boundary01
|
117
|
-
assert_raises(ArgumentError){ PT.clean_text("abc", boundary_style: nil) }
|
118
117
|
s1 = "\n ab\n \ncd\n \n \n ef\n \n \n \n gh\n \n \n \n"
|
119
|
-
s21 =
|
118
|
+
s21 = "\n ab\n \ncd\n \n \n ef\n \n \n \n gh\n"
|
120
119
|
s22 = "\n ab\n\ncd\n\n ef\n\n gh\n\n"
|
121
|
-
s23 =
|
122
|
-
sr = PT.clean_text(s1, boundary_style: :n, lastsps_style: :t, linehead_style: :n,
|
120
|
+
s23 = "\n ab\n\ncd\n\n\n ef\n\n\n gh\n\n\n"
|
121
|
+
sr = PT.clean_text(s1, boundary_style: :n, lastsps_style: :t, linehead_style: :n, firstlbs_style: :t, sps_style: :n)
|
123
122
|
assert_equal s21, sr, prerr(s21, sr)
|
124
|
-
sr = PT.clean_text(s1, boundary_style: :t, lastsps_style: :n, linehead_style: :n,
|
123
|
+
sr = PT.clean_text(s1, boundary_style: :t, lastsps_style: :n, linehead_style: :n, firstlbs_style: :n, sps_style: :n)
|
125
124
|
assert_equal s22, sr, prerr(s22, sr)
|
126
|
-
sr = PT.clean_text(s1, boundary_style: :t2, lastsps_style: :n, linehead_style: :t,
|
125
|
+
sr = PT.clean_text(s1, boundary_style: :t2, lastsps_style: :n, linehead_style: :t, firstlbs_style: :n, sps_style: :n)
|
127
126
|
assert_equal s23, sr, prerr(s23, sr)
|
128
127
|
end
|
129
128
|
|
129
|
+
def test_clean_text_markdown01
|
130
|
+
s0 = "\n ab \n \n cd \n \n\n ef \n \ngh \t \n\nij \t \n\nkl \u3000 \n\nmn"
|
131
|
+
s21 = "\n ab\n\n cd \n\n ef \n\ngh \t \n\nij \n\nkl \u3000 \n\nmn"
|
132
|
+
s22 = " ab\n\n cd \n\n ef \n\ngh \t \n\nij \n\nkl \u3000 \n\nmn"
|
133
|
+
|
134
|
+
sr = PT.clean_text(s0, linehead_style: :n, linetail_style: :m, firstlbs_style: :none)
|
135
|
+
assert_equal s21, sr, prerr(s21, sr)
|
136
|
+
sr = PT.clean_text(s0, linehead_style: :n, linetail_style: :m, firstlbs_style: :delete)
|
137
|
+
assert_equal s22, sr, prerr(s22, sr)
|
138
|
+
end
|
139
|
+
|
130
140
|
def test_clean_text_part01
|
131
141
|
s0 = "\n \n abc\n\n \ndef\n\n \n\n"
|
132
|
-
s1 = "
|
142
|
+
s1 = "TT abc\n\ndef\n"
|
133
143
|
p00 = PT::Part.parse s0
|
134
144
|
p0 = PT::Part.parse s0
|
135
|
-
sr = PT.clean_text(s0,
|
145
|
+
sr = PT.clean_text(s0, firstlbs_style: 'TT')
|
136
146
|
assert_equal s1, sr, prerr(s1, sr)
|
137
|
-
sr = PT.clean_text(p0,
|
147
|
+
sr = PT.clean_text(p0, firstlbs_style: 'TT')
|
138
148
|
assert_equal PT::Part, sr.class
|
139
149
|
assert_equal s1, sr.join
|
140
150
|
assert_equal p00, p0, prerr(p00, p0) # p0 is deepcopied?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: plain_text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.2'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Masa Sakano
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-10-
|
11
|
+
date: 2019-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: This module provides utility functions and methods to handle plain text,
|
14
14
|
classes Part/Paragraph/Boundary to represent the logical structure of a document
|
@@ -23,6 +23,7 @@ extra_rdoc_files:
|
|
23
23
|
files:
|
24
24
|
- ".gitignore"
|
25
25
|
- ChangeLog
|
26
|
+
- LICENSE.txt
|
26
27
|
- Makefile
|
27
28
|
- README.en.rdoc
|
28
29
|
- Rakefile
|
@@ -60,8 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
60
61
|
- !ruby/object:Gem::Version
|
61
62
|
version: '0'
|
62
63
|
requirements: []
|
63
|
-
|
64
|
-
rubygems_version: 2.7.3
|
64
|
+
rubygems_version: 3.0.3
|
65
65
|
signing_key:
|
66
66
|
specification_version: 4
|
67
67
|
summary: Module to handle Plain-Text
|