plain_text 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +51 -0
- data/ChangeLog +5 -0
- data/Makefile +23 -0
- data/README.en.rdoc +172 -0
- data/Rakefile +9 -0
- data/bin/countchar +89 -0
- data/lib/plain_text/parse_rule.rb +474 -0
- data/lib/plain_text/part/boundary.rb +44 -0
- data/lib/plain_text/part/paragraph.rb +35 -0
- data/lib/plain_text/part.rb +973 -0
- data/lib/plain_text/split.rb +103 -0
- data/lib/plain_text/util.rb +104 -0
- data/lib/plain_text.rb +839 -0
- data/plain_text.gemspec +49 -0
- data/test/test_plain_text.rb +280 -0
- data/test/test_plain_text_parse_rule.rb +146 -0
- data/test/test_plain_text_part.rb +353 -0
- data/test/test_plain_text_split.rb +78 -0
- metadata +72 -0
data/lib/plain_text.rb
ADDED
@@ -0,0 +1,839 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
# Utility methods for mainly line-based processing of String
|
4
|
+
#
|
5
|
+
# This module contains methods useful in processing a String object of a text file,
|
6
|
+
# that is, a String that contains an entire or a multiple-line part of a text file.
|
7
|
+
# The methods include normalizing the line-break codes, removing extra spaces from each line, etc.
|
8
|
+
# Many of the methods work on the basis of a line. For example, {#head} and {#tail} methods
|
9
|
+
# work like the respective UNIX-shell commands, returning a specified line at the head/tail parts of self.
|
10
|
+
#
|
11
|
+
# Most methods in this module are meant to be included in String, except for a few module functions.
|
12
|
+
# It is however debatable whether it is a good practice to include a third-party module in the core class.
|
13
|
+
# This module contains a helper module function {PlainText.extend_this}, with which an object extends this module easily as Singleton if this module is not already included.
|
14
|
+
#
|
15
|
+
# A few methods in this module assume that {PlainText::Split} is included in String,
|
16
|
+
# which in default is the case, as soon as this file is read (by Ruby's require).
|
17
|
+
#
|
18
|
+
# @author Masa Sakano (Wise Babel Ltd)
|
19
|
+
#
|
20
|
+
module PlainText
|
21
|
+
|
22
|
+
# List of the default line breaks.
|
23
|
+
DefLineBreaks = [ "\r\n", "\n", "\r" ] # cf., Default in the present environment: $/
|
24
|
+
|
25
|
+
# Default number of lines to extract for {#head} and {#tail}
|
26
|
+
DEF_HEADTAIL_N_LINES = 10
|
27
|
+
|
28
|
+
# Call instance method as a Module function
|
29
|
+
#
|
30
|
+
# The return String includes {PlainText} as Singleton.
|
31
|
+
#
|
32
|
+
# @param method [Symbol] module method name
|
33
|
+
# @param instr [String] String that is examined.
|
34
|
+
# @return [#instr]
|
35
|
+
def self.__call_inst_method__(method, instr, *rest, **k)
  # Dispatch on a clone so that the caller's object is never the receiver,
  # after making sure the clone responds to the PlainText instance methods.
  instr.clone.tap { |copy| PlainText.extend_this(copy) }.public_send(method, *rest, **k)
end
|
40
|
+
|
41
|
+
# If the class of the obj does not "include" this module, do so in the singular class.
|
42
|
+
#
|
43
|
+
# @param obj [Object] Maybe String. For which a singular class def is run, if the condition is met.
|
44
|
+
# @return [TrueClass, NilClass] true if the singular class def is run. Else nil.
|
45
|
+
def self.extend_this(obj)
  # One representative instance method is probed to decide whether the
  # object can already call the PlainText instance methods.
  if defined?(obj.delete_spaces_bw_cjk_european!)
    nil
  else
    obj.extend(PlainText)
    true
  end
end
|
50
|
+
|
51
|
+
# Module function of {#count_char}
|
52
|
+
#
|
53
|
+
# @param instr [String] String for which the number of chars is counted
|
54
|
+
# @param (see #count_char)
|
55
|
+
# @return [Integer]
|
56
|
+
def self.count_char(instr, *rest,
                    lbs_style: :delete,
                    lastsps_style: :delete,
                    lb_out: "\n",
                    **k)
  # The three named options only carry count-specific defaults; everything
  # is handed over to clean_text and the cleaned result is measured.
  opts = k.merge(lbs_style: lbs_style, lastsps_style: lastsps_style, lb_out: lb_out)
  clean_text(instr, *rest, **opts).size
end
|
63
|
+
|
64
|
+
|
65
|
+
# Cleans the text
|
66
|
+
#
|
67
|
+
# Such as, removing extra spaces, normalising the linebreaks, etc.
|
68
|
+
#
|
69
|
+
# In default,
|
70
|
+
#
|
71
|
+
# * Paragraphs (more than 2 +\n+) are taken into account (one +\n+ between two): +preserve_paragraph=true+
|
72
|
+
# * Blank lines are truncated into one line with no white spaces: +boundary_style=lb_out*2(=$/*2)+
|
73
|
+
# * Consecutive white spaces are truncated into a single space: +sps_style=:truncate+
|
74
|
+
# * White spaces before or after a CJK character are deleted: +delete_asian_space=true+
|
75
|
+
# * Preceding white spaces in each line are deleted: +linehead_style=:delete+
|
76
|
+
# * Trailing white spaces in each line are deleted: +linetail_style=:delete+
|
77
|
+
# * Preceding line-breaks (and any white spaces before them) at the beginning of the entire input string are deleted: +firstsps_style=:delete+
|
78
|
+
# * Trailing white spaces and line-breaks at the end of the entire input string are truncated into a single linebreak: +lastsps_style=:truncate+
|
79
|
+
#
|
80
|
+
# For a String with predominantly CJK characters, the following setting is recommended:
|
81
|
+
#
|
82
|
+
# * +lbs_style: :delete+
|
83
|
+
# * +delete_asian_space: true+ (Default)
|
84
|
+
#
|
85
|
+
# Note for the Symbols in optional arguments, the Symbol with the first character only is accepted,
|
86
|
+
# e.g., +:d+ instead of +:delete+ (nb., +:t2+ for +:truncate2+).
|
87
|
+
#
|
88
|
+
# For more detail, see the description.
|
89
|
+
#
|
90
|
+
# @param prt [PlainText::Part, String] {Part} or String to examine.
|
91
|
+
# @param preserve_paragraph: [Boolean] Paragraphs are taken into account if true (Def: true). In the input, paragraphs are defined to be separated with more than one +lb+ with potentially some space characters in between. Their output style is specified with +boundary_style+.
|
92
|
+
# @param boundary_style: [String, Symbol] One of +(:truncate|:truncate2|:delete|:none)+ or String. If String, the boundaries between paragraphs are replaced with this String (Def: +lb_out*2+). If +:truncate+, consecutive linebreaks and spaces are truncated into 2 linebreaks. +:truncate2+ are similar, but they are not truncated beyond 3 linebreaks (ie., up to 2 blank lines between Paragraphs). If +:none+, nothing is done about them. Unless :none, all the white spaces between linebreaks are deleted.
|
93
|
+
# @param lbs_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: +:truncate+). If :delete, all the linebreaks within paragraphs are deleted. +:truncate+ is meaningful only when +preserve_paragraph=false+ and consecutive linebreaks are truncated into 1 linebreak.
|
94
|
+
# @param sps_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: +:truncate+). If +:truncate+, the consecutive white spaces within paragraphs are truncated into a single white space. If :delete, they are deleted.
|
95
|
+
# @param lb_is_space: [Boolean] If true, a line-break, except those for the boundaries (unless +preserve_paragraph+ is false), is equivalent to a space (Def: False).
|
96
|
+
# @param delete_asian_space: [Boolean] Any spaces between, before, after Asian characters (but punctuation) are deleted, if true (Default).
|
97
|
+
# @param linehead_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: :delete). Determine how to handle consecutive white spaces at the beginning of each line.
|
98
|
+
# @param linetail_style: [Symbol] One of +(:truncate|:delete|:markdown|:none)+ (Def: :delete). Determine how to handle consecutive white spaces at the end of each line. If +:markdown+, two spaces at the end are preserved, whereas one or more than 2 consecutive spaces are deleted.
|
99
|
+
# @param firstsps_style: [Symbol, String] One of +(:truncate|:delete|:none)+ or String (Def: :delete). If +:truncate+, any of white spaces and linebreaks at the very beginning of self, if exist, are truncated to a single white space (different from +lastsps_style+). If String, they are, even if not exists, replaced with the specified String (such as a linebreak). If +:delete+, they are deleted.
|
100
|
+
# @param lastsps_style: [Symbol, String] One of +(:truncate|:delete|:none|:linebreak)+ or String (Def: :truncate). If +:truncate+, any of white spaces and linebreaks at the very end of self, if exist, are truncated to a single linebreak (different from +firstsps_style+). If +:delete+, they are deleted. If String, they are, even if not exists, replaced with the specified String (such as a linebreak). If +:linebreak+, +lb_out+ is used as String (i.e., only 1 linebreak always exists).
|
101
|
+
# @param lb: [String] Linebreak character like +\n+ etc (Default: $/). If this is one of the standard line-breaks, irregular line-breaks (for example, existence of CR when only LF should be there) are corrected.
|
102
|
+
# @param lb_out: [String] Linebreak used for output (Default: +lb+)
|
103
|
+
# @return same as prt
|
104
|
+
#
|
105
|
+
def self.clean_text(
      prt,
      preserve_paragraph: true,
      boundary_style: true, # If unspecified, will be replaced with lb_out * 2
      lbs_style: :truncate,
      lb_is_space: false,
      sps_style: :truncate,
      delete_asian_space: true,
      linehead_style: :delete,
      linetail_style: :delete,
      firstsps_style: :delete,
      lastsps_style: :truncate,
      lb: $/,
      lb_out: nil, # If unspecified, will be replaced with lb
      is_debug: false
    )

  # Resolve the options whose values depend on other options.
  lb_out ||= lb # Output linebreak
  boundary_style = lb_out*2 if true == boundary_style
  boundary_style = "" if [:delete, :d].include? boundary_style
  lastsps_style = lb_out if :linebreak == lastsps_style

  # An input whose class lacks #last_significant_element is handled as a
  # String; this is remembered so that a String is returned at the end.
  given_as_string = !prt.class.method_defined?(:last_significant_element)

  if given_as_string
    # Construct a Part instance from the given String; linebreaks are "\n" internally.
    text = prt.unicode_normalize
    lb_from = (DefLineBreaks.include?(lb) ? nil : lb)
    text = normalize_lb(text, "\n", lb_from: lb_from).dup
    parse_opts = (["\r\n", "\r", "\n"].include?(lb) ? {} : { rules: /#{Regexp.quote lb}{2,}/})
    prt = (preserve_paragraph ? Part.parse(text, **parse_opts) : Part.new([text]))
  else
    # If not preserve_paragraph, reconstructs it as a Part with a single Paragraph.
    # A deep copy is taken because the steps below modify prt in place.
    prt = (preserve_paragraph ? prt : Part.new([prt.join])).deepcopy
  end

  # NOTE(review): the spelling `squash_boundaryies!` is kept as found; confirm it matches the method defined on Part.
  prt.squash_boundaryies!

  # Boundaries between paragraphs.
  clean_text_boundary!(prt, boundary_style: boundary_style)

  # Linebreaks and spaces within paragraphs.
  clean_text_lbs_sps!(prt,
                      lbs_style: lbs_style,
                      lb_is_space: lb_is_space,
                      sps_style: sps_style,
                      delete_asian_space: delete_asian_space)

  # Head and tail of each line.
  clean_text_line_head_tail!(prt,
                             linehead_style: linehead_style,
                             linetail_style: linetail_style)

  # Head and tail of the whole text.
  clean_text_file_head_tail!(prt,
                             firstsps_style: firstsps_style,
                             lastsps_style: lastsps_style)

  # Replaces the internal "\n" with the requested output linebreak.
  prt.map{ |i| i.gsub!(/\n/m, lb_out) }

  given_as_string ? prt.join : prt # prt.to_s may be different from prt.join
end # def self.clean_text
|
169
|
+
|
170
|
+
# Module function of {#delete_spaces_bw_cjk_european}
|
171
|
+
#
|
172
|
+
# @param (see #delete_spaces_bw_cjk_european)
|
173
|
+
# @return as instr
|
174
|
+
def self.delete_spaces_bw_cjk_european(instr, *rest)
  # Module-function front end: delegates to the instance method of the same name.
  PlainText.__call_inst_method__(:delete_spaces_bw_cjk_european, instr, *rest)
end
|
177
|
+
|
178
|
+
|
179
|
+
# Module function of {#head}
|
180
|
+
#
|
181
|
+
# The return String includes {PlainText} as Singleton.
|
182
|
+
#
|
183
|
+
# @param instr [String] String that is examined.
|
184
|
+
# @param (see #head)
|
185
|
+
# @return as instr
|
186
|
+
def self.head(instr, *rest, **k)
  # Module-function front end: delegates to the instance method #head.
  PlainText.__call_inst_method__(:head, instr, *rest, **k)
end
|
189
|
+
|
190
|
+
|
191
|
+
# Module function of {#head_inverse}
|
192
|
+
#
|
193
|
+
# The return String includes {PlainText} as Singleton.
|
194
|
+
#
|
195
|
+
# @param instr [String] String that is examined.
|
196
|
+
# @param (see #head_inverse)
|
197
|
+
# @return as instr
|
198
|
+
def self.head_inverse(instr, *rest, **k)
  # Module-function front end: delegates to the instance method #head_inverse.
  PlainText.__call_inst_method__(:head_inverse, instr, *rest, **k)
end
|
201
|
+
|
202
|
+
|
203
|
+
# Module function of {#normalize_lb}
|
204
|
+
#
|
205
|
+
# The return String includes {PlainText} as Singleton.
|
206
|
+
#
|
207
|
+
# @param instr [String] String that is examined.
|
208
|
+
# @param (see #normalize_lb)
|
209
|
+
# @return as instr
|
210
|
+
def self.normalize_lb(instr, *rest, **k)
  # Module-function front end: delegates to the instance method #normalize_lb.
  PlainText.__call_inst_method__(:normalize_lb, instr, *rest, **k)
end
|
213
|
+
|
214
|
+
|
215
|
+
# Module function of {#tail}
|
216
|
+
#
|
217
|
+
# The return String includes {PlainText} as Singleton.
|
218
|
+
#
|
219
|
+
# @param instr [String] String that is examined.
|
220
|
+
# @param (see #tail)
|
221
|
+
# @return as instr
|
222
|
+
def self.tail(instr, *rest, **k)
  # Module-function front end: delegates to the instance method #tail.
  PlainText.__call_inst_method__(:tail, instr, *rest, **k)
end
|
225
|
+
|
226
|
+
|
227
|
+
# Module function of {#tail_inverse}
|
228
|
+
#
|
229
|
+
# The return String includes {PlainText} as Singleton.
|
230
|
+
#
|
231
|
+
# @param instr [String] String that is examined.
|
232
|
+
# @param (see #tail_inverse)
|
233
|
+
# @return as instr
|
234
|
+
def self.tail_inverse(instr, *rest, **k)
  # Module-function front end: delegates to the instance method #tail_inverse.
  PlainText.__call_inst_method__(:tail_inverse, instr, *rest, **k)
end
|
237
|
+
|
238
|
+
|
239
|
+
##########
|
240
|
+
# Class methods (Private)
|
241
|
+
##########
|
242
|
+
|
243
|
+
# @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
|
244
|
+
# @param boundary_style (see Plaintext.clean_text#boundary_style)
|
245
|
+
# @return [void]
|
246
|
+
#
|
247
|
+
# @see Plaintext.clean_text
|
248
|
+
def self.clean_text_boundary!(prt,
                              boundary_style: $/*2,
                              is_debug: false
                             )
  # Rewrites the boundaries of prt in place according to boundary_style.
  #
  # * String: every boundary is replaced with it, except that an empty
  #   boundary at the last index of prt is left untouched.
  # * :truncate (:t) / :truncate2 (:t2): blanks are removed from each boundary
  #   and runs of linebreaks longer than 2 (resp. 3) are cut down to 2 (resp. 3).
  # * :none (:n): nothing is done.
  #
  # Raises ArgumentError for any other value.
  case boundary_style
  when String
    prt.each_boundaries_with_index do |ec, i|
      ((i == prt.size - 1) && ec.empty?) ? ec : ec.replace(boundary_style)
    end
  when :truncate, :t, :truncate2, :t2
    max_lbs = ([:truncate, :t].include?(boundary_style) ? 2 : 3)
    prt.boundaries.each do |ec|
      ec.gsub!(/[[:blank:]]+/, "")
      # A plain counted quantifier; the former nested form "\n+{n,}" matched the same runs.
      ec.gsub!(/\n{#{max_lbs + 1},}/, "\n" * max_lbs)
    end
  when :none, :n
    # Do nothing
  else
    raise ArgumentError, "Invalid boundary_style (#{boundary_style.inspect}) is specified."
  end
end # self.clean_text_boundary!
|
267
|
+
private_class_method :clean_text_boundary!
|
268
|
+
|
269
|
+
# @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
|
270
|
+
# @param lbs_style (see Plaintext.clean_text#lbs_style)
|
271
|
+
# @param sps_style (see Plaintext.clean_text#sps_style)
|
272
|
+
# @param lb_is_space (see Plaintext.clean_text#lb_is_space)
|
273
|
+
# @param delete_asian_space (see Plaintext.clean_text#delete_asian_space)
|
274
|
+
# @return [void]
|
275
|
+
#
|
276
|
+
# @see Plaintext.clean_text
|
277
|
+
def self.clean_text_lbs_sps!(
      prt,
      lbs_style: :truncate,
      lb_is_space: false,
      sps_style: :truncate,
      delete_asian_space: true,
      is_debug: false
    )
  # Handles linebreaks and blanks inside each element of prt.parts, in place.
  #
  # 1. lbs_style, then sps_style: :truncate (:t) squeezes a run of 2 or more
  #    into one, :delete (:d) removes them all, :none (:n) does nothing.
  #    Any other value raises ArgumentError.
  # 2. If lb_is_space, every remaining linebreak becomes a space.
  # 3. If delete_asian_space, #delete_spaces_bw_cjk_european! is applied.
  [
    # NAME        STYLE      FROM           TO
    [:lbs_style, lbs_style, "\n",          "\n"],
    [:sps_style, sps_style, '[[:blank:]]', ' '],
  ].each do |name, style, from, to|
    case style
    when :truncate, :t
      prt.parts.each { |ec| ec.gsub!(/#{from}{2,}/m, to) }
    when :delete, :d
      prt.parts.each { |ec| ec.gsub!(/#{from}/m, "") }
    when :none, :n
      # Do nothing
    else
      raise ArgumentError, "Invalid #{name} (#{style.inspect}) is specified."
    end
  end

  # Linebreaks become spaces. No linebreak is left afterwards, so no further
  # linebreak truncation is attempted here.
  prt.parts.each { |ec| ec.gsub!(/\n/m, ' ') } if lb_is_space

  # Ignore spaces between, before, and after Asian characters.
  if delete_asian_space
    prt.parts.each do |ea_p|
      PlainText.extend_this(ea_p)
      ea_p.delete_spaces_bw_cjk_european! # Destructive change in prt.
    end
  end
end # self.clean_text_lbs_sps!
|
315
|
+
private_class_method :clean_text_lbs_sps!
|
316
|
+
|
317
|
+
# @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
|
318
|
+
# @param linehead_style [Symbol, String] (see Plaintext.clean_text#linehead_style)
|
319
|
+
# @param linetail_style [Symbol, String] (see Plaintext.clean_text#linetail_style)
|
320
|
+
# @return [void]
|
321
|
+
#
|
322
|
+
# @see Plaintext.clean_text
|
323
|
+
def self.clean_text_line_head_tail!(
      prt,
      linehead_style: :delete,
      linetail_style: :delete,
      is_debug: false
    )
  # Handles the blanks at the head and tail of each line in every element
  # of prt.parts, in place.
  #
  # linehead_style: :truncate (:t) -> one space, :delete (:d) -> removed, :none (:n).
  # linetail_style: the same, plus :markdown (:m), which removes a single
  # trailing blank and keeps exactly two trailing ASCII spaces.
  # Any other value raises ArgumentError.

  # Head of each line
  case linehead_style
  when :truncate, :t
    prt.parts.each { |ec| ec.gsub!(/^[[:blank:]]+/, ' ') }
  when :delete, :d
    prt.parts.each { |ec| ec.gsub!(/^[[:blank:]]+/, '') }
  when :none, :n
    # Do nothing
  else
    raise ArgumentError, "Invalid linehead_style (#{linehead_style.inspect}) is specified."
  end

  # Tail of each line
  case linetail_style
  when :truncate, :t
    prt.parts.each { |ec| ec.gsub!(/[[:blank:]]+$/, ' ') }
  when :delete, :d
    prt.parts.each { |ec| ec.gsub!(/[[:blank:]]+$/, '') }
  when :markdown, :m
    two_spaces = ' ' * 2
    prt.parts.each do |ec|
      # A single trailing blank is deleted.
      ec.gsub!(/(?:^|(?<![[:blank:]]))[[:blank:]]$/, '')
      # 3 or more blanks are truncated into 2 spaces, only if the last two
      # are ASCII spaces; a run of exactly 2 spaces is rewritten unchanged.
      ec.gsub!(/[[:blank:]]* {2}$/, two_spaces)
    end
  when :none, :n
    # Do nothing
  else
    raise ArgumentError, "Invalid linetail_style (#{linetail_style.inspect}) is specified."
  end
end # self.clean_text_line_head_tail!
|
358
|
+
private_class_method :clean_text_line_head_tail!
|
359
|
+
|
360
|
+
# @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
|
361
|
+
# @param firstsps_style [Symbol, String] (see Plaintext.clean_text#firstsps_style)
|
362
|
+
# @param lastsps_style [Symbol, String] (see Plaintext.clean_text#lastsps_style)
|
363
|
+
# @return [void]
|
364
|
+
#
|
365
|
+
# @see Plaintext.clean_text
|
366
|
+
def self.clean_text_file_head_tail!(
      prt,
      firstsps_style: :delete,
      lastsps_style: :truncate,
      is_debug: false
    )
  # Handles the white spaces/linebreaks at the very beginning and the very
  # end of prt, in place. Raises ArgumentError for an unknown style.

  # Handles the beginning of the given Part.
  obj = prt.first_significant_element

  case firstsps_style
  when String
    # Leading spaces, even if none exist, are replaced with the String.
    obj.sub!(/\A[[:space:]]*/m, firstsps_style)
  when :truncate, :t
    # The initial blank lines, if exist, are truncated to a single " "
    obj.sub!(/\A[[:space:]]+/m, " ")
  when :delete, :d
    # Leading spaces are deleted up to and including the last linebreak of the leading run.
    obj.sub!(/\A[[:space:]]*\n/m, "")
  when :none, :n
    # Do nothing
  else
    raise ArgumentError, "Invalid firstsps_style (#{firstsps_style.inspect}) is specified."
  end

  # Handles the end of the given Part.
  ind = prt.last_significant_index
  ind_para = (prt.index_para?(ind) ? ind : ind-1)  # ind_para guaranteed to be for Paragraph
  obj = Part.new(prt[ind_para, 2]).join  # The last Paragraph and what follows it, handled as one String

  changed =
    case lastsps_style
    when String
      # The trailing spaces and line-breaks, even if not exist, are replaced with a specified String.
      obj.sub!(/[[:space:]]*\z/m, lastsps_style)
    when :truncate, :t
      # The trailing spaces and line-breaks, if exist, are replaced with a single "\n".
      obj.sub!(/[[:space:]]+\z/m, "\n")
    when :delete, :d
      # The trailing spaces and line-breaks are deleted.
      obj.sub!(/[[:space:]]+\z/m, "")
    when :none, :n
      nil
    else
      raise ArgumentError, "Invalid lastsps_style (#{lastsps_style.inspect}) is specified."
    end

  return nil if !changed

  # Splits the modified String back into the two elements. The Paragraph must
  # match at the very beginning (\A); a per-line anchor (^) could match a later
  # line and drop everything in front of it.
  ma = /\A#{Regexp.quote prt[ind_para]}/.match obj
  if ma
    prt[ind_para].replace ma[0]
    prt[ind_para+1].replace ma.post_match
  else
    # The Paragraph itself was altered; it takes everything.
    prt[ind_para].replace obj
    prt[ind_para+1].replace ""
  end
end # self.clean_text_file_head_tail!
|
423
|
+
private_class_method :clean_text_file_head_tail!
|
424
|
+
|
425
|
+
|
426
|
+
####################################################
|
427
|
+
# Instance methods
|
428
|
+
####################################################
|
429
|
+
|
430
|
+
# Count the number of characters
|
431
|
+
#
|
432
|
+
# See {PlainText#clean_text!} for the optional parameters. The defaults of a few of the optional parameters are different from {PlainText#clean_text!},
|
433
|
+
# such as the default for +lb_out+ is "\n" (so that a line-break is 1 byte in size).
|
434
|
+
# It is so that this method is more optimized for East-Asian (CJK) characters, given this method is most useful for CJK Strings,
|
435
|
+
# whereas, for European alphabets, counting the number of words, rather than characters as in this method, would be more standard.
|
436
|
+
#
|
437
|
+
# @param (see PlainText#clean_text!)
|
438
|
+
# @return [Integer]
|
439
|
+
def count_char(*rest,
               lbs_style: :delete,
               lastsps_style: :none,
               lb_out: "\n",
               **k)
  # Clean self with the count-specific defaults, then measure the result.
  cleaned = PlainText.clean_text(self, *rest,
                                 lbs_style: lbs_style,
                                 lastsps_style: lastsps_style,
                                 lb_out: lb_out,
                                 **k)
  cleaned.size
end
|
446
|
+
|
447
|
+
# Delete all the spaces between CJK and European characters or numbers.
|
448
|
+
#
|
449
|
+
# All the spaces between CJK and European characters, numbers or punctuations
|
450
|
+
# are deleted or converted into a specified replacement character.
|
451
|
+
# Or, in short, any spaces between, before, and after a CJK character are deleted.
|
452
|
+
# If the return is non-nil, there is at least one match.
|
453
|
+
#
|
454
|
+
# @param repl [String] Replacement character (Default: "").
|
455
|
+
# @return [self, NilClass] self if there is at least one match, else nil.
|
456
|
+
def delete_spaces_bw_cjk_european!(repl="")
  # Replaces, in place, every run of blanks that sits between a CJK character
  # and a European letter/digit/punctuation (in either order) with repl.
  #
  # Both directions are always processed, and repl is inserted literally.
  # Returns self if anything matched, else nil.
  cjk_then_european = gsub!(/(\p{Hiragana}|\p{Katakana}|[ー-]|[一-龠々]|\p{Han}|\p{Hangul})([[:blank:]]+)([[:upper:][:lower:][:digit:][:punct:]])/) do
    "#{Regexp.last_match(1)}#{repl}#{Regexp.last_match(3)}"
  end
  european_then_cjk = gsub!(/([[:upper:][:lower:][:digit:][:punct:]])([[:blank:]]+)(\p{Hiragana}|\p{Katakana}|[ー-]|[一-龠々]|\p{Han}|\p{Hangul})/) do
    "#{Regexp.last_match(1)}#{repl}#{Regexp.last_match(3)}"
  end
  (cjk_then_european || european_then_cjk) ? self : nil
end
|
460
|
+
|
461
|
+
|
462
|
+
# Non-destructive version of {#delete_spaces_bw_cjk_european!}
|
463
|
+
#
|
464
|
+
# @param (see #delete_spaces_bw_cjk_european!)
|
465
|
+
# @return same class as self
|
466
|
+
def delete_spaces_bw_cjk_european(*rest)
  # Apply the destructive version to a clone and hand the clone back.
  clone.tap { |copy| copy.delete_spaces_bw_cjk_european!(*rest) }
end
|
471
|
+
|
472
|
+
|
473
|
+
# Destructive version of {#head}
|
474
|
+
#
|
475
|
+
# @param (see #head)
|
476
|
+
# @return [self]
|
477
|
+
def head!(*rest, **key)
  # Compute the head first, then overwrite self with it.
  extracted = head(*rest, **key)
  replace(extracted)
end
|
480
|
+
|
481
|
+
# Returns the first num lines (or characters, bytes) or before the last n-th line.
|
482
|
+
#
|
483
|
+
# If "byte" is specified as the return unit, the encoding is the same as self,
|
484
|
+
# though the encoding for the returned String may not be valid anymore.
|
485
|
+
# Note that it is probably the better practice to use +string[ 0..5 ]+ and +string#byteslice(0,5)+
|
486
|
+
# instead of this method for the units of "char" and "byte", respectively.
|
487
|
+
#
|
488
|
+
# For num, a negative number means counting from the last (e.g., -1 (lines, if unit is :line) means
|
489
|
+
# everything but the last 1 line, and -5 means everything but the last 5 lines), whereas 0 is forbidden.
|
490
|
+
# If a too big negative number is given, such as -9 for String of 2 lines, a null string is returned.
|
491
|
+
#
|
492
|
+
# If unit is :line, num can be Regexp, in which case the string of the lines up to the *first* line
|
493
|
+
# that matches the given Regexp is returned, where the process is based on the lines. For example,
|
494
|
+
# if num is +/ABC/+ (Regexp), String of the lines from the beginning up to the line that contains the character +"ABC"+ is returned.
|
495
|
+
#
|
496
|
+
# @param num_in [Integer, Regexp] Number (positive or negative, but not 0) of :unit to extract (Def: 10), or Regexp, which is valid only if unit is :line.
|
497
|
+
# @param unit: [Symbol, String] One of +:line+ (or +"-n"+), :+char+, +:byte+ (or +"-c"+)
|
498
|
+
# @param inclusive: [Boolean] read only when unit is :line. If inclusive (Default), the (entire) line that matches is included in the result.
|
499
|
+
# @param linebreak: [String] +\n+ etc (Default: +$/+), used when +unit==:line+ (Default)
|
500
|
+
# @return [String] as self
|
501
|
+
def head(num_in=DEF_HEADTAIL_N_LINES, unit: :line, inclusive: true, linebreak: $/)
  # Returns the first num_in lines, characters or bytes of self.
  #
  # num_in is either an Integer-like object (must be 1 or greater) or, for the
  # line unit only, a Regexp, in which case the work is delegated to #head_regexp.
  # NOTE(review): the documentation of this method describes negative counts,
  # but this implementation rejects any num smaller than 1.
  #
  # Raises ArgumentError for a non-positive number, an unknown unit, or a
  # Regexp combined with a non-line unit.
  if num_in.class.method_defined? :to_int
    num = num_in.to_int
    raise ArgumentError, "Non-positive num (#{num_in}) is given in #{__method__}" if num < 1
  elsif num_in.class.method_defined? :named_captures
    re_in = num_in
  else
    raise raise_typeerror(num_in, 'Integer or Regexp')
  end

  case unit
  when :line, "-n"
    # Regexp (for boundary)
    return head_regexp(re_in, inclusive: inclusive, linebreak: linebreak) if re_in

    # Integer (a number of lines). The negative limit keeps trailing empty
    # fields, so blank lines at the end of self are counted as lines.
    ret = split(linebreak, -1)[0..(num-1)].join(linebreak)
    return ret if size <= ret.size  # Specified line is larger than the original or the last NL is missing.
    ret << linebreak                # NL is added to the tail as in the original.
  when :char, :byte, "-c"
    raise ArgumentError, "Regexp is accepted only when unit is :line in #{__method__}" if re_in
    (:char == unit) ? self[0..(num-1)] : byteslice(0..(num-1))
  else
    raise ArgumentError, "Specified unit (#{unit.inspect}) is invalid in #{__method__}"
  end
end
|
528
|
+
|
529
|
+
|
530
|
+
# Destructive version of {#head_inverse}
|
531
|
+
#
|
532
|
+
# @param (see #head_inverse)
|
533
|
+
# @return [self]
|
534
|
+
def head_inverse!(*rest, **key)
  # Compute the remainder first, then overwrite self with it.
  remainder = head_inverse(*rest, **key)
  replace(remainder)
end
|
537
|
+
|
538
|
+
# Inverse of head - returns the content except for the first num lines (or characters, bytes)
|
539
|
+
#
|
540
|
+
# @param (see #head)
|
541
|
+
# @return same as self
|
542
|
+
def head_inverse(*rest, **key)
  # Returns what is left of self after removing the part #head returns for
  # the same arguments.
  #
  # The cut is made at a byte offset, so it is also right when #head was
  # asked for bytes and stopped in the middle of a multi-byte character.
  # NOTE(review): assumes the String returned by #head is always a prefix of self.
  leading = head(*rest, **key)
  return '' if leading.bytesize >= bytesize
  byteslice(leading.bytesize..-1)
end
|
546
|
+
|
547
|
+
# Normalizes line-breaks
|
548
|
+
#
|
549
|
+
# All the line-breaks of self are converted into a new character or \n
|
550
|
+
# If the return is non-nil, self contains unexpected line-break characters
|
551
|
+
# for the OS.
|
552
|
+
#
|
553
|
+
# @param repl [String] Replacement character (Default: +$/+ which is +\n+ in UNIX).
|
554
|
+
# @param lb_from [String, Array, NilClass] Candidate line-break(s) (Default: +[CR+LF, LF, CR]+)
|
555
|
+
# @return [MatchData, NilClass] MatchData of the last match if there is non-$/ match, else nil.
|
556
|
+
def normalize_lb!(repl=$/, lb_from: nil)
  # Converts, in place, every line-break of self listed in lb_from
  # (Default: DefLineBreaks) into repl.
  #
  # All candidates are replaced in a single pass, so a replacement that itself
  # contains a candidate (e.g. "\r\n") is never processed again, and String
  # candidates are matched literally (Regexp metacharacters are escaped).
  #
  # Returns the MatchData of the last matched line-break that differs from $/,
  # or nil if there is no such match.
  candidates = [lb_from || DefLineBreaks].flatten
  ret = nil
  gsub!(Regexp.union(candidates)) do |matched|
    ret = Regexp.last_match if matched != $/
    repl
  end
  ret
end
|
566
|
+
|
567
|
+
# Non-destructive version of {#normalize_lb!}
|
568
|
+
#
|
569
|
+
# @param (see #normalize_lb!)
|
570
|
+
# @return same class as self
|
571
|
+
def normalize_lb(*rest, **k)
  # clone (not dup) is used so that singleton methods, which may include
  # this very method, are carried over to the returned object.
  clone.tap { |copy| copy.normalize_lb!(*rest, **k) }
end
|
576
|
+
|
577
|
+
|
578
|
+
# String#strip! for each line
|
579
|
+
#
|
580
|
+
# @param strip_head: [Boolean] if true (Default), spaces at each line head are removed.
|
581
|
+
# @param strip_tail: [Boolean] if true (Default), spaces at each line tail are removed (see +markdown+ option).
|
582
|
+
# @param markdown: [Boolean] if true (Def: false), a double space at each tail remains and +strip_head+ is forcibly false.
|
583
|
+
# @param linebreak: [String] +\n+ etc (Default: +$/+)
|
584
|
+
# @return [self, NilClass] nil if gsub! does not match at all, i.e., there are no spaces to remove.
|
585
|
+
def strip_at_lines!(strip_head: true, strip_tail: true, markdown: false, linebreak: $/)
  # markdown forcibly switches the head stripping off.
  head_changed = strip_at_lines_head!(linebreak: linebreak) if strip_head && !markdown
  tail_changed = strip_at_lines_tail!(markdown: markdown, linebreak: linebreak) if strip_tail
  # self if either step changed something, else nil.
  self if head_changed || tail_changed
end
|
591
|
+
|
592
|
+
# Non-destructive version of {#strip_at_lines!}
|
593
|
+
#
|
594
|
+
# @param (see #strip_at_lines!)
|
595
|
+
# @return same class as self
|
596
|
+
def strip_at_lines(*rest, **k)
  # clone (not dup) is used so that singleton methods, which may include
  # this very method, are carried over to the returned object.
  clone.tap { |copy| copy.strip_at_lines!(*rest, **k) }
end
|
601
|
+
|
602
|
+
|
603
|
+
# String#strip! for each line but only for the head part (NOT tail part)
|
604
|
+
#
|
605
|
+
# @param linebreak: [String] "\n" etc (Default: $/)
|
606
|
+
# @return [self, NilClass] nil if gsub! does not match at all, i.e., there are no spaces to remove.
|
607
|
+
def strip_at_lines_head!(linebreak: $/)
  # Blanks right after the beginning of self or right after a linebreak are
  # removed; the linebreak itself (capture 1) is put back.
  gsub!(/(\A|#{Regexp.quote(linebreak)})[[:blank:]]+/m, '\1')
end
|
611
|
+
|
612
|
+
# Non-destructive version of {#strip_at_lines_head!}
|
613
|
+
#
|
614
|
+
# @param (see #strip_at_lines_head!)
|
615
|
+
# @return same class as self
|
616
|
+
def strip_at_lines_head(*rest, **k)
  # clone (not dup) is used so that singleton methods, which may include
  # this very method, are carried over to the returned object.
  clone.tap { |copy| copy.strip_at_lines_head!(*rest, **k) }
end
|
621
|
+
|
622
|
+
# String#strip! for each line but only for the tail part (NOT head part)
|
623
|
+
#
|
624
|
+
# @param markdown: [Boolean] if true (Def: false), a double space at each tail remains.
|
625
|
+
# @param linebreak: [String] "\n" etc (Default: $/)
|
626
|
+
# @return [self, NilClass] nil if gsub! does not match at all, i.e., there are no spaces to remove.
|
627
|
+
def strip_at_lines_tail!(markdown: false, linebreak: $/)
  eol = Regexp.quote(linebreak)
  # Pattern for a run of blanks that starts right after a non-blank character (or at the
  # beginning of a line) and reaches the linebreak or the end of the String.
  # +count+ is the quantifier applied to the blank character.
  trailing_blanks = lambda { |count| /(?<=^|[^[:blank:]])[[:blank:]]#{count}(#{eol}|\z)/m }

  unless markdown
    return gsub!(trailing_blanks.call('+'), '\1')
  end

  # Markdown mode: exactly two trailing blanks are left untouched, so only runs of
  # three or more blanks and runs of a single blank are removed.
  long_removed   = gsub!(trailing_blanks.call('{3,}'), '\1')
  single_removed = gsub!(trailing_blanks.call(''), '\1')
  return nil unless long_removed || single_removed
  self
end
|
635
|
+
|
636
|
+
# Non-destructive version of {#strip_at_lines_tail!}
|
637
|
+
#
|
638
|
+
# @param (see #strip_at_lines_tail!)
|
639
|
+
# @return same class as self
|
640
|
+
def strip_at_lines_tail(*rest, **k)
  # clone (not dup) is essential: the copy must keep the Singleton methods of self,
  # which may include the destructive method called here.
  clone.tap { |copy| copy.strip_at_lines_tail!(*rest, **k) }
end
|
645
|
+
|
646
|
+
|
647
|
+
# Destructive version of {#tail}
|
648
|
+
#
|
649
|
+
# @param (see #tail)
|
650
|
+
# @return [self]
|
651
|
+
def tail!(*rest, **key)
  # Compute the non-destructive result first, then overwrite self in place with it.
  extracted = tail(*rest, **key)
  replace(extracted)
end
|
654
|
+
|
655
|
+
# Returns the last num lines (or characters, bytes), or the n-th line from the head and everything after it.
|
656
|
+
#
|
657
|
+
# If "byte" is specified as the return unit, the encoding is the same as self,
|
658
|
+
# though the encoding for the returned String may not be valid anymore.
|
659
|
+
# Note that it is probably the better practice to use +string[ -5..-1 ]+ and +string#byteslice(-5,5)+
|
660
|
+
# instead of this method for the units of "char" and "byte", respectively.
|
661
|
+
#
|
662
|
+
# For num, a negative number means counting from the first (e.g., -1 [lines, if unit is :line] means
|
663
|
+
# everything from the first line onwards (i.e., the whole text), and -5 means everything from the 5th line onwards (i.e., all but the first 4 lines), whereas 0 is forbidden.
|
664
|
+
# If a too big negative number is given, such as -9 for String of 2 lines, a null string is returned.
|
665
|
+
#
|
666
|
+
# If unit is :line, num can be Regexp, in which case the string of the lines *after* the *first* line
|
667
|
+
# that matches the given Regexp is returned (*not* inclusive), where the process is based on the lines. For example,
|
668
|
+
# if num is /ABC/, String of the lines from the next line of the first line that contains the character "ABC"
|
669
|
+
# till the last one is returned. "The next line" means (1) the line immediately after the match
|
670
|
+
# if the matched string has the linebreak at the end, or (2) the line after the first linebreak after the matched string,
|
671
|
+
# where the trailing characters after the matched string to the linebreak (inclusive) is ignored.
|
672
|
+
#
|
673
|
+
# = Tips =
|
674
|
+
# To specify the *last* line that matches the Regexp, consider prefixing +(?:.*)+ with the option +m+,
|
675
|
+
# e.g., +/(?:.*)ABC/m+
|
676
|
+
#
|
677
|
+
# = Note for developers =
|
678
|
+
#
|
679
|
+
# The line that matches with Regexp has to be exclusive. Because otherwise to specify the last line
|
680
|
+
# that matches would be impossible in principle. For example, to specify the *last* line that matches +ABC+,
|
681
|
+
# the given regexp should be +/(?:.*)ABC/m+ (see the above Tips); in this case, if this matched line was inclusive,
|
682
|
+
# *all the lines from Line 1* would be included, which is most likely not what the caller wants.
|
683
|
+
#
|
684
|
+
# @param num_in [Integer, Regexp] Number (positive or negative, but not 0) of :unit to extract (Def: 10), or Regexp, which is valid only if unit is :line. If positive, the last num_in lines are returned. If negative, the lines from the num-in-th line from the head are returned. In short, calling this method as +tail(3)+ and +tail(-3)+ is similar to the UNIX commands "tail -n 3" and "tail -n +3", respectively.
|
685
|
+
# @param unit: [Symbol] One of :line (as in -n option), :char, :byte (-c option)
|
686
|
+
# @param inclusive: [Boolean] read only when unit is :line. If inclusive (Default), the (entire) line that matches is included in the result.
|
687
|
+
# @param linebreak: [String] +\n+ etc (Default: +$/+), used when unit==:line (Default)
|
688
|
+
# @return [String] as self
|
689
|
+
def tail(num_in=DEF_HEADTAIL_N_LINES, unit: :line, inclusive: true, linebreak: $/)
  # Duck-typed dispatch on the first argument: anything convertible with #to_int is a
  # count, anything that looks like a Regexp (responds to #named_captures) is a boundary.
  if num_in.class.method_defined? :to_int
    num = num_in.to_int
    raise ArgumentError, "num of zero is given in #{__method__}" if num == 0
    # A negative count -N means "from the N-th unit onwards", i.e., the first (N-1) units
    # are skipped; hence the shift by one before it is used as a start index (-num) below.
    num += 1 if num < 0
  elsif num_in.class.method_defined? :named_captures
    re_in = num_in
  else
    # raise_typeerror is defined outside this method; the outer raise is kept so that an
    # exception is raised whether the helper raises by itself or merely returns one.
    raise raise_typeerror(num_in, 'Integer or Regexp')
  end

  case unit
  when :line, '-n'
    # Regexp (for boundary)
    return tail_regexp(re_in, inclusive: inclusive, linebreak: linebreak) if re_in

    # Integer (a number of lines)
    tail_linenum(num_in, num, linebreak: linebreak)
  when :char
    raise ArgumentError, "Regexp is accepted only when unit is :line in #{__method__}" if re_in
    # A positive count not smaller than the length means the whole String.
    num = 0 if num >= size && num_in > 0
    # String#[] returns nil when the start index is beyond the end (too big a negative
    # count); an empty String is returned in that case, as documented.
    self[(-num)..-1] || self.class.new("")
  when :byte, '-c'
    raise ArgumentError, "Regexp is accepted only when unit is :line in #{__method__}" if re_in
    num = 0 if num >= bytesize && num_in > 0
    # Same nil guard as for :char; the returned bytes may not form a valid encoding.
    byteslice((-num)..-1) || self.class.new("")
  else
    raise ArgumentError, "Specified unit (#{unit.inspect}) is invalid in #{__method__}"
  end
end
|
717
|
+
|
718
|
+
# Destructive version of {#tail_inverse}
|
719
|
+
#
|
720
|
+
# @param (see #tail_inverse)
|
721
|
+
# @return [self]
|
722
|
+
def tail_inverse!(*rest, **key)
  # Compute the non-destructive result first, then overwrite self in place with it.
  remaining = tail_inverse(*rest, **key)
  replace(remaining)
end
|
725
|
+
|
726
|
+
# Inverse of tail - returns the content except for the trailing part that {#tail} would return (e.g., except for the last num lines, characters or bytes when num is positive)
|
727
|
+
#
|
728
|
+
# @param (see #tail)
|
729
|
+
# @return same as self
|
730
|
+
def tail_inverse(*rest, **key)
  tail_part = tail(*rest, **key)
  # Number of leading characters that do NOT belong to the tail.
  n_keep = size - tail_part.size
  n_keep > 0 ? self[0, n_keep] : ''
end
|
734
|
+
|
735
|
+
|
736
|
+
##########
|
737
|
+
# Instance methods (private)
|
738
|
+
##########
|
739
|
+
|
740
|
+
# head command with Regexp
|
741
|
+
#
|
742
|
+
# @param re_in [Regexp] Regexp to determine the boundary.
|
743
|
+
# @param inclusive: [Boolean] If true (Default), the (entire) line that matches re_in is included in the result. Else the entire line is excluded.
|
744
|
+
# @param linebreak: [String] +\n+ etc (Default: $/).
|
745
|
+
# @return [String] as self
|
746
|
+
# @see #head
|
747
|
+
def head_regexp(re_in, inclusive: true, linebreak: $/)
  found = re_in.match(self)
  # No boundary found: the whole String is the head.
  return self unless found

  unless inclusive
    # Everything before the line in which the match begins.
    return pre_match_in_line(found.pre_match, linebreak: linebreak).pre_match
  end

  # Everything up to the match, the match itself, and the remainder of its line.
  rest_of_line = post_match_in_line(found, linebreak: linebreak)[0]
  found.pre_match + found[0] + rest_of_line
end
private :head_regexp
|
757
|
+
|
758
|
+
|
759
|
+
# Returns MatchData of the String at and before the first linebreak before the MatchData (inclusive)
|
760
|
+
#
|
761
|
+
# @param strpre [String] String of prematch of the last MatchData
|
762
|
+
# @param linebreak: [String] +\n+ etc (Default: $/)
|
763
|
+
# @return [MatchData] m[0] is the string after the last linebreak before the matched data (exclusive) and m.pre_match is all the lines before that.
|
764
|
+
def pre_match_in_line(strpre, linebreak: $/)
  lb_quo = Regexp.quote linebreak

  # strpre ends exactly at a line boundary: nothing of the current line precedes the
  # match, hence an empty match at the very end (m[0] == "", m.pre_match == strpre).
  return /\z/.match(strpre) if /#{lb_quo}\z/ =~ strpre

  # Otherwise match the trailing partial line: it starts at the head of the String or just
  # after a linebreak, and contains no linebreak up to the end.
  # A negative lookahead is used (rather than a character class built from the linebreak)
  # so that a multi-character linebreak such as "\r\n" is excluded as a sequence, not as
  # a set of individual characters. The m option lets "." match newline characters too,
  # as the linebreak can be any characters.
  /(?:\A|(?<=#{lb_quo}))(?:(?!#{lb_quo}).)*\z/m.match strpre
end
private :pre_match_in_line
|
770
|
+
|
771
|
+
# Returns MatchData of the String after the MatchData to the linebreak (inclusive)
|
772
|
+
#
|
773
|
+
# @param mat [MatchData, String]
|
774
|
+
# @param strpost [String, nil] Post-match, if mat is String.
|
775
|
+
# @param linebreak: [String] +\n+ etc (Default: $/)
|
776
|
+
# @return [MatchData] m[0] is the string after matched data and up to the next first linebreak (inclusive) (or empty string if the last character(s) of matched data is the linebreak) and m.post_match is all the lines after that.
|
777
|
+
def post_match_in_line(mat, strpost=nil, linebreak: $/)
  # Two calling forms: a MatchData (its own post-match is used and strpost is ignored),
  # or the matched String accompanied by an explicit strpost.
  if mat.class.method_defined? :post_match
    # mat is MatchData
    strmatched, strpost = mat[0], mat.post_match
  else
    # raise_typeerror is defined outside this method.
    strmatched = mat.to_str rescue raise_typeerror(mat, 'String')
  end
  lb_quo = Regexp.quote linebreak

  # The matched string already ends its line: nothing is left on that line, hence an
  # empty match at the head of strpost (m[0] == "", m.post_match == strpost).
  # Regexp#match requires the target String as its argument.
  return /\A/.match(strpost) if /#{lb_quo}\z/ =~ strmatched

  # Otherwise consume the remainder of the line including its linebreak. If the line is
  # the last one and has no linebreak, consume up to the end of the String, so that a
  # MatchData (rather than nil) is returned in that case too.
  # Non-greedy match and m option are required, as linebreak can be any characters.
  /.*?(?:#{lb_quo}|\z)/m.match strpost
end
private :post_match_in_line
|
789
|
+
|
790
|
+
# tail command with Regexp
|
791
|
+
#
|
792
|
+
# @param re_in [Regexp] Regexp to determine the boundary.
|
793
|
+
# @param inclusive: [Boolean] If true (Default), the (entire) line that matches re_in is included in the result. Else the entire line is excluded.
|
794
|
+
# @param linebreak: [String] +\n+ etc (Default: $/).
|
795
|
+
# @return [String] as self
|
796
|
+
# @see #tail
|
797
|
+
def tail_regexp(re_in, inclusive: true, linebreak: $/)
  # PlainText::Split#split_with_delimiter (included in String).
  # NOTE(review): the code below relies on the last two elements being the final matched
  # delimiter and the text following it - confirm against split_with_delimiter.
  pieces = split_with_delimiter(re_in)
  return self.class.new("") if pieces.size == 0 # Maybe self is a sub-class of String.

  last_delimiter = pieces[-2]
  remainder      = pieces[-1]

  unless inclusive
    # Everything after the line on which the delimiter ends.
    return post_match_in_line(last_delimiter, remainder, linebreak: linebreak).post_match
  end

  # Note: Even if (pieces.size < 3), pieces[0..-3] returns [].
  preceding = pieces[0..-3].join
  # The part of the delimiter's own line that precedes it, the delimiter, and the rest.
  pre_match_in_line(preceding, linebreak: linebreak)[0] + last_delimiter + remainder
end
private :tail_regexp
|
809
|
+
|
810
|
+
|
811
|
+
# tail command based on the number of lines
|
812
|
+
#
|
813
|
+
# @param num_in [Integer] Original argument of the specified number of lines
|
814
|
+
# @param num [Integer] Converted integer for num_in
|
815
|
+
# @param linebreak: [String] +\n+ etc (Default: $/).
|
816
|
+
# @return [String] as self
|
817
|
+
# @see #tail
|
818
|
+
def tail_linenum(num_in, num, linebreak: $/)
  lines = split(linebreak, -1) # -1 is specified to preserve the last linebreak(s).
  return self.class.new("") if lines.empty?

  lb_quo = Regexp.quote(linebreak)
  n_back = num
  if num_in > 0
    # A trailing linebreak yields an extra empty element at the end of lines;
    # it must not be counted as one of the requested lines.
    n_back += 1 if self =~ /#{lb_quo}\z/
    # More lines requested than available: return everything.
    n_back = 0 if n_back >= lines.size
  end

  # For a negative num_in, -n_back is a non-negative start index counted from the head.
  picked = lines[(-n_back)..-1]
  return self.class.new("") if picked.nil? || picked.empty?
  picked.join(linebreak)
end
private :tail_linenum
|
831
|
+
|
832
|
+
|
833
|
+
end # module PlainText
|
834
|
+
|
835
|
+
require "plain_text/part"
|
836
|
+
require "plain_text/parse_rule"
|
837
|
+
require "plain_text/split"
|
838
|
+
require "plain_text/util"
|
839
|
+
|