plain_text 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/plain_text.rb ADDED
@@ -0,0 +1,839 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Utility methods for mainly line-based processing of String
4
+ #
5
+ # This module contains methods useful in processing a String object of a text file,
6
+ # that is, a String that contains an entire or a multiple-line part of a text file.
7
+ # The methods include normalizing the line-break codes, removing extra spaces from each line, etc.
8
+ # Many of the methods work on the basis of a line. For example, {#head} and {#tail} methods
9
+ # work like the respective UNIX-shell commands, returning a specified line at the head/tail parts of self.
10
+ #
11
+ # Most methods in this module are meant to be included in String, except for a few module functions.
12
+ # It is however debatable whether it is a good practice to include a third-party module in the core class.
13
+ # This module contains a helper module function {PlainText.extend_this}, with which an object extends this module easily as Singleton if this module is not already included.
14
+ #
15
+ # A few methods in this module assume that {PlainText::Split} is included in String,
16
+ # which in default is the case, as soon as this file is read (by Ruby's require).
17
+ #
18
+ # @author Masa Sakano (Wise Babel Ltd)
19
+ #
20
+ module PlainText
21
+
22
# Line-break sequences recognised by default, in the order CRLF, LF, CR.
# Frozen so that this shared constant cannot be modified by accident.
# cf., Default in the present environment: $/
DefLineBreaks = [ "\r\n", "\n", "\r" ].freeze

# Default number of lines to extract for {#head} and {#tail}
DEF_HEADTAIL_N_LINES = 10
27
+
28
# Calls an instance method of this module as a module function.
#
# A copy of +instr+ is made, the copy is extended with {PlainText} via
# {PlainText.extend_this}, and the requested public method is invoked on the copy.
#
# @param method [Symbol] name of the instance method to call
# @param instr [String] String that is examined.
# @param rest [Array] positional arguments forwarded to the method
# @param k [Hash] keyword arguments forwarded to the method
# @return [Object] whatever the called method returns
def self.__call_inst_method__(method, instr, *rest, **k)
  # clone (not dup) keeps the singleton methods of instr.  freeze: false is
  # required because a frozen copy cannot be extended with a module.
  newself = instr.clone(freeze: false)
  PlainText.extend_this(newself)
  newself.public_send(method, *rest, **k)
end
40
+
41
# Extends +obj+ with this module unless it already offers the module's methods.
#
# The presence of +delete_spaces_bw_cjk_european!+ on +obj+ is used as the
# indicator that this module is already available to it.
#
# @param obj [Object] Maybe String. The object to extend (on its singleton class).
# @return [TrueClass, NilClass] true if obj has just been extended. Else nil.
def self.extend_this(obj)
  if defined? obj.delete_spaces_bw_cjk_european!
    nil
  else
    obj.extend(PlainText)
    true
  end
end
50
+
51
# Module-function form of {#count_char}
#
# Passes everything to {PlainText.clean_text} and returns the size of the result.
#
# @param instr [String] String for which the number of chars is counted
# @param (see #count_char)
# @return [Integer]
def self.count_char(instr, *rest, lbs_style: :delete, lastsps_style: :delete, lb_out: "\n", **k)
  cleaned = clean_text(instr, *rest, lbs_style: lbs_style, lastsps_style: lastsps_style, lb_out: lb_out, **k)
  cleaned.size
end
63
+
64
+
65
# Cleans the text
#
# Such as, removing extra spaces, normalising the linebreaks, etc.
#
# In default,
#
# * Paragraphs (more than 2 +\n+) are taken into account (one +\n+ between two): +preserve_paragraph=true+
# * Blank lines are truncated into one line with no white spaces: +boundary_style=lb_out*2(=$/*2)+
# * Consecutive white spaces are truncated into a single space: +sps_style=:truncate+
# * White spaces before or after a CJK character is deleted: +delete_asian_space=true+
# * Preceding white spaces in each line are deleted: +linehead_style=:delete+
# * Trailing white spaces in each line are deleted: +linetail_style=:delete+
# * Preceding blank lines at the beginning of the entire input string are deleted: +firstsps_style=:delete+
# * Trailing white spaces and line-breaks at the end of the entire input string are truncated into a single linebreak: +lastsps_style=:truncate+
#
# For a String with predominantly CJK characters, the following setting is recommended:
#
# * +lbs_style: :delete+
# * +delete_asian_space: true+ (Default)
#
# Note for the Symbols in optional arguments, the Symbol with the first character only is accepted,
# e.g., +:d+ instead of +:delete+ (nb., +:t2+ for +:truncate2+, +:l+ for +:linebreak+).
#
# The work is delegated, in this order, to the private helpers for the boundaries,
# for the linebreaks and spaces within Paragraphs, for the head/tail of each line,
# and for the head/tail of the whole text.
#
# @param prt [PlainText:Part, String] {Part} or String to examine. It is treated as a {Part} if its class defines +last_significant_element+.
# @param preserve_paragraph: [Boolean] Paragraphs are taken into account if true (Default). In the input, paragraphs are defined to be separated with more than one +lb+ with potentially some space characters in between. Their output style is specified with +boundary_style+.
# @param boundary_style: [String, Symbol, TrueClass] One of +(:truncate|:truncate2|:delete|:none)+ or String. If String, the boundaries between paragraphs are replaced with this String. If true (Default), it means +lb_out*2+. If +:truncate+, consecutive linebreaks and spaces are truncated into 2 linebreaks. +:truncate2+ is similar, but they are not truncated beyond 3 linebreaks (ie., up to 2 blank lines between Paragraphs). If +:none+, nothing is done about them. Unless +:none+, all the white spaces between linebreaks are deleted.
# @param lbs_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: +:truncate+). If +:delete+, all the linebreaks within paragraphs are deleted. +:truncate+ is meaningful only when +preserve_paragraph=false+ and consecutive linebreaks are truncated into 1 linebreak.
# @param lb_is_space: [Boolean] If true, a line-break, except those for the boundaries (unless +preserve_paragraph+ is false), is equivalent to a space (Def: false).
# @param sps_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: +:truncate+). If +:truncate+, the consecutive white spaces within paragraphs are truncated into a single white space. If +:delete+, they are deleted.
# @param delete_asian_space: [Boolean] Any spaces between, before, after Asian characters (but punctuation) are deleted, if true (Default).
# @param linehead_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: +:delete+). Determine how to handle consecutive white spaces at the beginning of each line.
# @param linetail_style: [Symbol] One of +(:truncate|:delete|:markdown|:none)+ (Def: +:delete+). Determine how to handle consecutive white spaces at the end of each line. See the markdown handling of the line-tail helper for +:markdown+.
# @param firstsps_style: [Symbol, String] One of +(:truncate|:delete|:none)+ or String (Def: +:delete+). If +:truncate+, any of white spaces and linebreaks at the very beginning of self, if exist, are truncated to a single white space (different from +lastsps_style+). If String, they are, even if not exists, replaced with the specified String (such as a linebreak). If +:delete+, they are deleted.
# @param lastsps_style: [Symbol, String] One of +(:truncate|:delete|:none|:linebreak)+ or String (Def: +:truncate+). If +:truncate+, any of white spaces and linebreaks at the very end of self, if exist, are truncated to a single linebreak (different from +firstsps_style+). If +:delete+, they are deleted. If String, they are, even if not exists, replaced with the specified String (such as a linebreak). If +:linebreak+, +lb_out+ is used as String (i.e., only 1 linebreak always exists).
# @param lb: [String] Linebreak character like +\n+ etc (Default: $/). If this is one of the standard line-breaks, irregular line-breaks (for example, existence of CR when only LF should be there) are corrected.
# @param lb_out: [String] Linebreak used for output (Default: +lb+)
# @param is_debug: [Boolean] Not used in this method.
# @return same as prt (a String if a String is given; else the processed copy of the Part)
# @raise [ArgumentError] if one of the style options is not recognised (raised in the helpers).
def self.clean_text(
      prt,
      preserve_paragraph: true,
      boundary_style: true,   # If unspecified, will be replaced with lb_out * 2
      lbs_style: :truncate,
      lb_is_space: false,
      sps_style: :truncate,
      delete_asian_space: true,
      linehead_style: :delete,
      linetail_style: :delete,
      firstsps_style: :delete,
      lastsps_style: :truncate,
      lb: $/,
      lb_out: nil,            # If unspecified, will be replaced with lb
      is_debug: false
    )

  lb_out ||= lb  # Output linebreak
  boundary_style = lb_out*2 if true == boundary_style
  boundary_style = "" if [:delete, :d].include? boundary_style
  # :l is accepted too, consistently with the first-letter shorthands of the other styles.
  lastsps_style  = lb_out if [:linebreak, :l].include? lastsps_style

  # Anything whose class lacks Part's API is handled as a String.
  given_string = !prt.class.method_defined?(:last_significant_element)

  if given_string
    # Construct a Part instance from the given String, internally with "\n" as the linebreak.
    prt = prt.unicode_normalize
    prt = normalize_lb(prt, "\n", lb_from: (DefLineBreaks.include?(lb) ? nil : lb)).dup
    # NOTE(review): for a non-standard lb the text has already been normalised to "\n"
    # above, and the quantifier below binds only to the last character of a
    # multi-character lb; confirm against Part.parse what +rules+ is expected to be.
    kwd = (["\r\n", "\r", "\n"].include?(lb) ? {} : { rules: /#{Regexp.quote lb}{2,}/})
    prt = (preserve_paragraph ? Part.parse(prt, **kwd) : Part.new([prt]))
  else
    # If not preserve_paragraph, reconstructs it as a Part with a single Paragraph.
    # Also, deepcopy is needed, as the following processing is destructive.
    prt = (preserve_paragraph ? prt : Part.new([prt.join])).deepcopy
  end
  prt.squash_boundaryies!  # Boundaries are squashed.

  # Handles Boundary
  clean_text_boundary!(prt, boundary_style: boundary_style)

  # Handles linebreaks and spaces (within Paragraphs)
  clean_text_lbs_sps!(
    prt,
    lbs_style: lbs_style,
    lb_is_space: lb_is_space,
    sps_style: sps_style,
    delete_asian_space: delete_asian_space,
  )

  # Handles the line head/tails.
  clean_text_line_head_tail!(
    prt,
    linehead_style: linehead_style,
    linetail_style: linetail_style
  )

  # Handles the file head/tail.
  clean_text_file_head_tail!(
    prt,
    firstsps_style: firstsps_style,
    lastsps_style: lastsps_style,
  )

  # Replaces the (internal) linebreaks with the specified one, in place.
  prt.map{ |i| i.gsub!(/\n/m, lb_out) }

  (given_string ? prt.join : prt)  # prt.to_s may be different from prt.join
end # def self.clean_text
169
+
170
# Module-function form of {#delete_spaces_bw_cjk_european}
#
# The call is made on a copy of +instr+ through {PlainText.__call_inst_method__}.
#
# @param instr [String] String that is examined.
# @param (see #delete_spaces_bw_cjk_european)
# @return as instr
def self.delete_spaces_bw_cjk_european(instr, *rest)
  __call_inst_method__(
    :delete_spaces_bw_cjk_european,
    instr,
    *rest
  )
end
177
+
178
+
179
# Module-function form of {#head}
#
# The call is made on a copy of +instr+ that is extended with {PlainText}.
#
# @param instr [String] String that is examined.
# @param (see #head)
# @return as instr
def self.head(instr, *rest, **k)
  PlainText.__call_inst_method__(:head, instr, *rest, **k)
end
189
+
190
+
191
# Module-function form of {#head_inverse}
#
# The call is made on a copy of +instr+ that is extended with {PlainText}.
#
# @param instr [String] String that is examined.
# @param (see #head_inverse)
# @return as instr
def self.head_inverse(instr, *rest, **k)
  PlainText.__call_inst_method__(:head_inverse, instr, *rest, **k)
end
201
+
202
+
203
# Module-function form of {#normalize_lb}
#
# The call is made on a copy of +instr+ that is extended with {PlainText}.
#
# @param instr [String] String that is examined.
# @param (see #normalize_lb)
# @return as instr
def self.normalize_lb(instr, *rest, **k)
  PlainText.__call_inst_method__(:normalize_lb, instr, *rest, **k)
end
213
+
214
+
215
# Module-function form of {#tail}
#
# The call is made on a copy of +instr+ that is extended with {PlainText}.
#
# @param instr [String] String that is examined.
# @param (see #tail)
# @return as instr
def self.tail(instr, *rest, **k)
  PlainText.__call_inst_method__(:tail, instr, *rest, **k)
end
225
+
226
+
227
# Module-function form of {#tail_inverse}
#
# The call is made on a copy of +instr+ that is extended with {PlainText}.
#
# @param instr [String] String that is examined.
# @param (see #tail_inverse)
# @return as instr
def self.tail_inverse(instr, *rest, **k)
  PlainText.__call_inst_method__(:tail_inverse, instr, *rest, **k)
end
237
+
238
+
239
+ ##########
240
+ # Class methods (Private)
241
+ ##########
242
+
243
# Rewrites, in place, the boundaries (the text between Paragraphs) of the given Part.
#
# * String: every boundary is replaced with it, except that an empty boundary
#   at the last index is left as it is.
# * +:truncate+ (+:t+): blanks are removed and 3 or more linebreaks become 2.
# * +:truncate2+ (+:t2+): blanks are removed and 4 or more linebreaks become 3.
# * +:none+ (+:n+): nothing is done.
#
# @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
# @param boundary_style (see Plaintext.clean_text#boundary_style)
# @param is_debug: [Boolean] Not used in this method.
# @return [void]
# @raise [ArgumentError] if boundary_style is none of the above.
#
# @see Plaintext.clean_text
def self.clean_text_boundary!(prt, boundary_style: $/*2, is_debug: false)
  case boundary_style
  when String
    prt.each_boundaries_with_index{|ec, i| ((i == prt.size - 1) && ec.empty?) ? ec : ec.replace(boundary_style)}
  when :truncate, :t
    prt.boundaries.each{|ec| ec.gsub!(/[[:blank:]]+/m, ""); ec.gsub!(/\n{3,}/m, "\n\n")}
  when :truncate2, :t2
    prt.boundaries.each{|ec| ec.gsub!(/[[:blank:]]+/m, ""); ec.gsub!(/\n{4,}/m, "\n\n\n")}
  when :none, :n
    # Do nothing
  else
    raise ArgumentError, "Invalid boundary_style (#{boundary_style.inspect}) is specified."
  end
end # self.clean_text_boundary!
267
+ private_class_method :clean_text_boundary!
268
+
269
# Rewrites, in place, the linebreaks and white spaces inside the Paragraphs of the given Part.
#
# The linebreaks are processed first according to +lbs_style+, then the blanks
# according to +sps_style+; after that the remaining linebreaks are turned into
# spaces if +lb_is_space+, and finally the spaces around CJK characters are
# removed if +delete_asian_space+.
#
# @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
# @param lbs_style (see Plaintext.clean_text#lbs_style)
# @param sps_style (see Plaintext.clean_text#sps_style)
# @param lb_is_space (see Plaintext.clean_text#lb_is_space)
# @param delete_asian_space (see Plaintext.clean_text#delete_asian_space)
# @param is_debug: [Boolean] Not used in this method.
# @return [void]
# @raise [ArgumentError] if lbs_style or sps_style is not recognised.
#
# @see Plaintext.clean_text
def self.clean_text_lbs_sps!(
      prt,
      lbs_style: :truncate,
      lb_is_space: false,
      sps_style: :truncate,
      delete_asian_space: true,
      is_debug: false
    )

  # Linebreaks and spaces
  #   OPTION-NAME   STYLE     FROM(Regexp source) TO
  [[:lbs_style, lbs_style, "\n",          "\n"],
   [:sps_style, sps_style, '[[:blank:]]', " "]].each do |name, style, from, to|
    case style
    when :truncate, :t
      prt.parts.each{|ec| ec.gsub!(/#{from}{2,}/m, to)}
    when :delete, :d
      prt.parts.each{|ec| ec.gsub!(/#{from}/m, "")}
    when :none, :n
      # Do nothing
    else
      raise ArgumentError, "Invalid #{name} (#{style.inspect}) is specified."
    end
  end

  # Linebreaks become spaces.  No linebreak remains afterwards, so there is
  # nothing left to truncate among linebreaks.
  if lb_is_space
    prt.parts.each{|ec| ec.gsub!(/\n/m, " ")}
  end

  # Ignore spaces between, before, and after Asian characters.
  if delete_asian_space
    prt.parts.each do |ea_p|
      PlainText.extend_this(ea_p)
      ea_p.delete_spaces_bw_cjk_european!  # Destructive change in prt.
    end
  end
end # self.clean_text_lbs_sps!
315
+ private_class_method :clean_text_lbs_sps!
316
+
317
# Rewrites, in place, the blanks at the head and tail of each line in the Paragraphs of the given Part.
#
# For +linetail_style+ of +:markdown+ (+:m+): a single trailing blank is deleted,
# whereas 2 or more trailing blanks that end with two ASCII spaces are
# truncated into exactly two ASCII spaces.
#
# @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
# @param linehead_style [Symbol] (see Plaintext.clean_text#linehead_style)
# @param linetail_style [Symbol] (see Plaintext.clean_text#linetail_style)
# @param is_debug: [Boolean] Not used in this method.
# @return [void]
# @raise [ArgumentError] if either of the styles is not recognised.
#
# @see Plaintext.clean_text
def self.clean_text_line_head_tail!(
      prt,
      linehead_style: :delete,
      linetail_style: :delete,
      is_debug: false
    )

  # Head of each line
  case linehead_style
  when :truncate, :t
    prt.parts.each{|ec| ec.gsub!(/^[[:blank:]]+/, " ")}
  when :delete, :d
    prt.parts.each{|ec| ec.gsub!(/^[[:blank:]]+/, "")}
  when :none, :n
    # Do nothing
  else
    raise ArgumentError, "Invalid linehead_style (#{linehead_style.inspect}) is specified."
  end

  # Tail of each line
  case linetail_style
  when :truncate, :t
    prt.parts.each{|ec| ec.gsub!(/[[:blank:]]+$/, " ")}
  when :delete, :d
    prt.parts.each{|ec| ec.gsub!(/[[:blank:]]+$/, "")}
  when :markdown, :m
    # A single space is deleted.
    prt.parts.each{|ec| ec.gsub!(/(?:^|(?<![[:blank:]]))[[:blank:]]$/, "")}
    # Two spaces are preserved: 3 or more blanks are truncated into 2 spaces,
    # only IF the last two are the ASCII spaces.  The two spaces are spelled
    # out explicitly so that they cannot be lost in reformatting.
    prt.parts.each{|ec| ec.gsub!(/[[:blank:]]*[ ]{2}$/, "\x20\x20")}
  when :none, :n
    # Do nothing
  else
    raise ArgumentError, "Invalid linetail_style (#{linetail_style.inspect}) is specified."
  end
end # self.clean_text_line_head_tail!
358
+ private_class_method :clean_text_line_head_tail!
359
+
360
# Rewrites, in place, the white spaces at the very beginning and the very end of the given Part.
#
# The beginning is handled on the first significant element of the Part.
# The end is handled on the String made of the last Paragraph and the boundary
# that follows it; the result is then split back into these two elements.
#
# @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
# @param firstsps_style [Symbol, String] (see Plaintext.clean_text#firstsps_style)
# @param lastsps_style [Symbol, String] (see Plaintext.clean_text#lastsps_style)
# @param is_debug: [Boolean] Not used in this method.
# @return [void]
# @raise [ArgumentError] if either of the styles is not recognised.
#
# @see Plaintext.clean_text
def self.clean_text_file_head_tail!(
      prt,
      firstsps_style: :delete,
      lastsps_style: :truncate,
      is_debug: false
    )

  # Handles the beginning of the given Part.
  obj = prt.first_significant_element
  # The first significant element is either Paragraph or Background. Either way,
  # the beginning of the next element would not have any [[:space:]].

  case firstsps_style
  when String
    obj.sub!(/\A[[:space:]]*/m, firstsps_style)
  when :truncate, :t
    # The initial blank lines, if exist, are truncated to a single " "
    obj.sub!(/\A[[:space:]]+/m, " ")
  when :delete, :d
    # The initial white spaces up to (and including) the last leading linebreak are deleted.
    obj.sub!(/\A[[:space:]]*\n/m, "")
  when :none, :n
    # Do nothing
  else
    raise ArgumentError, "Invalid firstsps_style (#{firstsps_style.inspect}) is specified."
  end

  # Handles the end of the given Part.
  ind = prt.last_significant_index
  ind_para = (prt.index_para?(ind) ? ind : ind-1)  # ind_para guaranteed to be for Paragraph
  obj = Part.new(prt[ind_para, 2]).join  # Handles as a String
  case lastsps_style
  when String
    # The trailing spaces and line-breaks, even if not exist, are replaced with a specified String.
    changed = obj.sub!(/[[:space:]]*\z/m, lastsps_style)
  when :truncate, :t
    # The trailing spaces and line-breaks, if exist, are replaced with a single linebreak.
    changed = obj.sub!(/[[:space:]]+\z/m, "\n")
  when :delete, :d
    # The trailing spaces and line-breaks are deleted.
    changed = obj.sub!(/[[:space:]]+\z/m, "")
  when :none, :n
    # Do nothing
  else
    raise ArgumentError, "Invalid lastsps_style (#{lastsps_style.inspect}) is specified."
  end

  return nil if !changed

  # Splits the modified String back into the Paragraph and its boundary.
  # \A (not ^) is essential: the Paragraph has to be found at the very
  # beginning of obj, not at the beginning of one of the later lines.
  ma = /\A#{Regexp.quote prt[ind_para]}/.match obj
  if ma
    prt[ind_para].replace ma[0]
    prt[ind_para+1].replace ma.post_match
  else
    # The Paragraph itself has been modified; it takes everything.
    prt[ind_para].replace obj
    prt[ind_para+1].replace ""
  end
end # self.clean_text_file_head_tail!
423
+ private_class_method :clean_text_file_head_tail!
424
+
425
+
426
+ ####################################################
427
+ # Instance methods
428
+ ####################################################
429
+
430
# Counts the number of characters of self after cleaning it
#
# self is passed to {PlainText.clean_text} and the size of the result is returned.
# The defaults of the options listed in the signature of this method are the
# ones used when the caller does not specify them; e.g., the default for +lb_out+
# is "\n" (so that a line-break is counted as 1 character).
# This method is most useful for East-Asian (CJK) Strings, whereas, for European
# alphabets, counting the number of words, rather than characters as in this
# method, would be more standard.
#
# @param (see PlainText.clean_text)
# @return [Integer]
def count_char(*rest, lbs_style: :delete, lastsps_style: :none, lb_out: "\n", **k)
  cleaned = PlainText.clean_text(self, *rest, lbs_style: lbs_style, lastsps_style: lastsps_style, lb_out: lb_out, **k)
  cleaned.size
end
446
+
447
# Delete all the spaces between CJK and European characters or numbers.
#
# All the blanks between a CJK character and a European character, number or
# punctuation, in either order, are deleted or converted into the specified
# replacement String.  Both directions are always processed.
#
# NOTE(review): the character class +[ー-]+ below contains a literal ASCII
# hyphen; verify against the upstream source whether another (full-width)
# character was intended there.
#
# @param repl [String] Replacement String, inserted literally (Default: "").
# @return [self, NilClass] self if there is at least one match, else nil.
def delete_spaces_bw_cjk_european!(repl="")
  # CJK followed by European
  r1 = gsub!(/(\p{Hiragana}|\p{Katakana}|[ー-]|[一-龠々]|\p{Han}|\p{Hangul})([[:blank:]]+)([[:upper:][:lower:][:digit:][:punct:]])/) {
    "#{$1}#{repl}#{$3}"
  }
  # European followed by CJK
  r2 = gsub!(/([[:upper:][:lower:][:digit:][:punct:]])([[:blank:]]+)(\p{Hiragana}|\p{Katakana}|[ー-]|[一-龠々]|\p{Han}|\p{Hangul})/) {
    "#{$1}#{repl}#{$3}"
  }
  (r1 || r2) ? self : nil
end
460
+
461
+
462
# Non-destructive version of {#delete_spaces_bw_cjk_european!}
#
# Works on a clone of self, which is returned whether or not anything is changed.
#
# @param (see #delete_spaces_bw_cjk_european!)
# @return same class as self
def delete_spaces_bw_cjk_european(*rest)
  clone.tap { |copied| copied.delete_spaces_bw_cjk_european!(*rest) }
end
471
+
472
+
473
# Destructive version of {#head}
#
# The content of self is replaced with the return of {#head}.
#
# @param (see #head)
# @return [self]
def head!(*rest, **key)
  kept = head(*rest, **key)
  replace(kept)
end
480
+
481
# Returns the first num lines (or characters, bytes).
#
# If "byte" is specified as the return unit, the encoding is the same as self,
# though the encoding for the returned String may not be valid anymore.
# Note that it is probably the better practice to use +string[ 0..5 ]+ and +string#byteslice(0,5)+
# instead of this method for the units of "char" and "byte", respectively.
#
# num must be 1 or greater in this implementation.
#
# If unit is :line, num can be Regexp, in which case the processing is
# delegated to +head_regexp+ with +inclusive+ and +linebreak+. For example,
# if num is +/ABC/+ (Regexp), String of the lines from the beginning up to the
# first line that contains the character +"ABC"+ is meant to be returned.
#
# With unit of :line, the lines are counted including the blank lines, and the
# returned String ends with the linebreak unless the end of self without
# a trailing linebreak is reached.
#
# @param num_in [Integer, Regexp] Number (1 or greater) of :unit to extract (Def: 10), or Regexp, which is valid only if unit is :line.
# @param unit: [Symbol, String] One of +:line+ (or +"-n"+), +:char+, +:byte+ (or +"-c"+)
# @param inclusive: [Boolean] read only when num_in is Regexp (and hence unit is :line); passed to +head_regexp+ (Def: true).
# @param linebreak: [String] +\n+ etc (Default: +$/+), used when +unit==:line+ (Default)
# @return [String] as self
# @raise [ArgumentError] if num is less than 1, if unit is invalid, or if Regexp is given with a unit other than :line.
def head(num_in=DEF_HEADTAIL_N_LINES, unit: :line, inclusive: true, linebreak: $/)
  if num_in.class.method_defined? :to_int
    num = num_in.to_int
    raise ArgumentError, "Non-positive num (#{num_in}) is given in #{__method__}" if num < 1
  elsif num_in.class.method_defined? :named_captures
    re_in = num_in
  else
    raise raise_typeerror(num_in, 'Integer or Regexp')
  end

  case unit
  when :line, "-n"
    # Regexp (for boundary)
    return head_regexp(re_in, inclusive: inclusive, linebreak: linebreak) if re_in

    # Integer (a number of lines)
    # The limit of -1 keeps the trailing blank lines, which split() discards otherwise.
    ret = split(linebreak, -1)[0..(num-1)].join(linebreak)
    return ret if size <= ret.size  # Specified line is larger than the original or the last NL is missing.
    return(ret << linebreak)        # NL is added to the tail as in the original.
  when :char
    raise ArgumentError, "Regexp is valid only if unit is :line in #{__method__}" if re_in
    return self[0..(num-1)]
  when :byte, "-c"
    raise ArgumentError, "Regexp is valid only if unit is :line in #{__method__}" if re_in
    return self.byteslice(0..(num-1))
  else
    raise ArgumentError, "Specified unit (#{unit.inspect}) is invalid in #{__method__}"
  end
end
528
+
529
+
530
# Destructive version of {#head_inverse}
#
# The content of self is replaced with the return of {#head_inverse}.
#
# @param (see #head_inverse)
# @return [self]
def head_inverse!(*rest, **key)
  remainder = head_inverse(*rest, **key)
  replace(remainder)
end
537
+
538
# Inverse of head - returns the content except for the first num lines (or characters, bytes)
#
# The part that {#head} would return is skipped, based on its size in characters.
#
# @param (see #head)
# @return same as self
def head_inverse(*rest, **key)
  n_skipped = head(*rest, **key).size
  return '' if n_skipped >= size

  self[n_skipped..-1]
end
546
+
547
# Normalizes line-breaks
#
# All the candidate line-breaks in self are converted into +repl+ in a single
# pass, where a longer candidate (like CR+LF) takes precedence over a shorter
# one (like CR), so that the result is correct for any +repl+, including CR+LF.
# The candidates are treated as literal Strings (not as Regexp sources).
#
# If the return is non-nil, self contained a line-break other than +$/+,
# that is, an unexpected line-break character for the OS.
#
# @param repl [String] Replacement String, inserted literally (Default: +$/+ which is +\n+ in UNIX).
# @param lb_from [String, Array, NilClass] Candidate line-break(s) (Default: +[CR+LF, LF, CR]+)
# @return [MatchData, NilClass] MatchData of the last match, in self before the conversion, of the candidates other than +$/+; nil if there is none.
def normalize_lb!(repl=$/, lb_from: nil)
  lb_from ||= DefLineBreaks
  # Longest first, so that CR+LF is taken as a single line-break.
  candidates = [lb_from].flatten.sort_by { |ea_lb| -ea_lb.size }
  return nil if candidates.empty?

  # The return value is based on the candidates that differ from $/.
  ret = nil
  foreign = candidates.reject { |ea_lb| $/ == ea_lb }
  unless foreign.empty?
    re_foreign = Regexp.union(foreign)
    pos = rindex(re_foreign)
    ret = match(re_foreign, pos) if pos
  end

  gsub!(Regexp.union(candidates)) { repl }
  ret
end
566
+
567
# Non-destructive version of {#normalize_lb!}
#
# Works on a clone of self, which is returned whether or not anything is changed.
#
# @param (see #normalize_lb!)
# @return same class as self
def normalize_lb(*rest, **k)
  # clone, not dup: the singleton methods, which may include this very method, have to be carried over.
  clone.tap { |copied| copied.normalize_lb!(*rest, **k) }
end
576
+
577
+
578
# String#strip! for each line
#
# Delegates to {#strip_at_lines_head!} and {#strip_at_lines_tail!}.
#
# @param strip_head: [Boolean] if true (Default), spaces at each line head are removed.
# @param strip_tail: [Boolean] if true (Default), spaces at each line tail are removed (see +markdown+ option).
# @param markdown: [Boolean] if true (Def: false), a double space at each tail remains and +strip_head+ is forcibly false.
# @param linebreak: [String] +\n+ etc (Default: +$/+)
# @return [self, NilClass] nil if nothing is matched at all, i.e., there are no spaces to remove.
def strip_at_lines!(strip_head: true, strip_tail: true, markdown: false, linebreak: $/)
  head_done = nil
  tail_done = nil
  # In the markdown mode, the line heads are never touched.
  head_done = strip_at_lines_head!(linebreak: linebreak) if strip_head && !markdown
  tail_done = strip_at_lines_tail!(markdown: markdown, linebreak: linebreak) if strip_tail
  self if head_done || tail_done
end
591
+
592
# Non-destructive version of {#strip_at_lines!}
#
# Works on a clone of self, which is returned whether or not anything is changed.
#
# @param (see #strip_at_lines!)
# @return same class as self
def strip_at_lines(*rest, **k)
  # clone, not dup: the singleton methods, which may include this very method, have to be carried over.
  clone.tap { |copied| copied.strip_at_lines!(*rest, **k) }
end
601
+
602
+
603
# String#strip! for each line but only for the head part (NOT tail part)
#
# The blanks at the beginning of self and those immediately after each linebreak are removed.
#
# @param linebreak: [String] "\n" etc (Default: $/)
# @return [self, NilClass] nil if gsub! does not match at all, i.e., there are no spaces to remove.
def strip_at_lines_head!(linebreak: $/)
  line_start = /(\A|#{Regexp.quote linebreak})[[:blank:]]+/m
  gsub!(line_start, '\1')
end
611
+
612
# Non-destructive version of {#strip_at_lines_head!}
#
# Works on a clone of self, which is returned whether or not anything is changed.
#
# @param (see #strip_at_lines_head!)
# @return same class as self
def strip_at_lines_head(*rest, **k)
  # clone, not dup: the singleton methods, which may include this very method, have to be carried over.
  clone.tap { |copied| copied.strip_at_lines_head!(*rest, **k) }
end
621
+
622
# String#strip! for each line but only for the tail part (NOT head part)
#
# Without +markdown+, all the blanks at the end of each line are removed.
# With +markdown+, 3 or more blanks at the end of a line are removed first,
# then a single blank at the end of a line is removed; a double blank remains.
#
# @param markdown: [Boolean] if true (Def: false), a double space at each tail remains.
# @param linebreak: [String] "\n" etc (Default: $/)
# @return [self, NilClass] nil if gsub! does not match at all, i.e., there are no spaces to remove.
def strip_at_lines_tail!(markdown: false, linebreak: $/)
  eol = Regexp.quote linebreak

  unless markdown
    return gsub!(/(?<=^|[^[:blank:]])[[:blank:]]+(#{eol}|\z)/m, '\1')
  end

  many_done   = gsub!(/(?<=^|[^[:blank:]])[[:blank:]]{3,}(#{eol}|\z)/m, '\1')
  single_done = gsub!(/(?<=^|[^[:blank:]])[[:blank:]](#{eol}|\z)/m, '\1')
  self if many_done || single_done
end
635
+
636
# Non-destructive version of {#strip_at_lines_tail!}
#
# Works on a clone of self, which is returned whether or not anything is changed.
#
# @param (see #strip_at_lines_tail!)
# @return same class as self
def strip_at_lines_tail(*rest, **k)
  # clone, not dup: the singleton methods, which may include this very method, have to be carried over.
  clone.tap { |copied| copied.strip_at_lines_tail!(*rest, **k) }
end
645
+
646
+
647
# Destructive version of {#tail}
#
# The content of self is replaced with the return of {#tail}.
#
# @param (see #tail)
# @return [self]
def tail!(*rest, **key)
  kept = tail(*rest, **key)
  replace(kept)
end
654
+
655
+ # Returns the last num lines (or characters, bytes) or of and after the first n-th line.
656
+ #
657
+ # If "byte" is specified as the return unit, the encoding is the same as self,
658
+ # though the encoding for the returned String may not be valid anymore.
659
+ # Note that it is probably the better practice to use +string[ -5..-1 ]+ and +string#byteslice(-5,5)+
660
+ # instead of this method for the units of "char" and "byte", respectively.
661
+ #
662
+ # For num, a negative number means counting from the first (e.g., -1 [lines, if unit is :line] means
663
+ # everything but the first 1 line, and -5 means everything but the first 5 lines), whereas 0 is forbidden.
664
+ # If a too big negative number is given, such as -9 for String of 2 lines, a null string is returned.
665
+ #
666
+ # If unit is :line, num can be Regexp, in which case the string of the lines *after* the *first* line
667
+ # that matches the given Regexp is returned (*not* inclusive), where the process is based on the lines. For example,
668
+ # if num is /ABC/, String of the lines from the next line of the first line that contains the character "ABC"
669
+ # till the last one is returned. "The next line" means (1) the line immediately after the match
670
+ # if the matched string has the linebreak at the end, or (2) the line after the first linebreak after the matched string,
671
+ # where the trailing characters after the matched string to the linebreak (inclusive) is ignored.
672
+ #
673
+ # = Tips =
674
+ # To specify the *last* line that matches the Regexp, consider prefixing +(?:.*)+ with the option +m+,
675
+ # e.g., +/(?:.*)ABC/m+
676
+ #
677
+ # = Note for developers =
678
+ #
679
+ # The line that matches with Regexp has to be exclusive. Because otherwise to specify the last line
680
+ # that matches would be impossible in principle. For example, to specify the *last* line that matches +ABC+,
681
+ # the given regexp should be +/(?:.*)ABC/m+ (see the above Tips); in this case, if this matched line was inclusive,
682
+ # *all the lines from Line 1* would be included, which is most likely not what the caller wants.
683
+ #
684
+ # @param num_in [Integer, Regexp] Number (positive or negative, but not 0) of :unit to extract (Def: 10), or Regexp, which is valid only if unit is :line. If positive, the last num_in lines are returned. If negative, the lines from the num-in-th line from the head are returned. In short, calling this method as +tail(3)+ and +tail(-3)+ is similar to the UNIX commands "tail -n 3" and "tail -n +3", respectively.
685
+ # @param unit: [Symbol] One of :line (as in -n option), :char, :byte (-c option)
686
+ # @param inclusive: [Boolean] read only when unit is :line. If inclusive (Default), the (entire) line that matches is included in the result.
687
+ # @param linebreak: [String] +\n+ etc (Default: +$/+), used when unit==:line (Default)
688
+ # @return [String] as self
689
# @raise [ArgumentError] if num_in is 0, if unit is none of :line ('-n'), :char, :byte ('-c'),
#   or if a Regexp is given together with a unit other than :line.
def tail(num_in=DEF_HEADTAIL_N_LINES, unit: :line, inclusive: true, linebreak: $/)
  if num_in.respond_to? :to_int
    num = num_in.to_int
    raise ArgumentError, "num of zero is given in #{__method__}" if num == 0
    # A negative number -n means "from the n-th unit on" (cf. "tail -n +n"),
    # so only the first (n-1) units are skipped.
    num += 1 if num < 0
  elsif num_in.respond_to? :named_captures
    # Regexp-like object: it marks the boundary line instead of a count.
    re_in = num_in
  else
    # raise_typeerror is defined elsewhere in this library; presumably it raises by itself,
    # in which case the outer raise is never reached.
    raise raise_typeerror(num_in, 'Integer or Regexp')
  end

  case unit
  when :line, '-n'
    # Regexp (for boundary)
    return tail_regexp(re_in, inclusive: inclusive, linebreak: linebreak) if re_in

    # Integer (a number of lines)
    tail_linenum(num_in, num, linebreak: linebreak)
  when :char
    raise ArgumentError, "Regexp is valid only if unit is :line in #{__method__}" if re_in
    # A positive count at least as large as self means the whole String.
    num = 0 if num >= size && num_in > 0
    # The slice is nil if the start is beyond the end (too big negative number): return a null string.
    self[(-num)..-1] || self.class.new("")
  when :byte, '-c'
    raise ArgumentError, "Regexp is valid only if unit is :line in #{__method__}" if re_in
    num = 0 if num >= bytesize && num_in > 0
    byteslice((-num)..-1) || self.class.new("")
  else
    raise ArgumentError, "Specified unit (#{unit.inspect}) is invalid in #{__method__}"
  end
end
717
+
718
# In-place counterpart of {#tail_inverse}: the content of self is replaced with its result.
#
# @param (see #tail_inverse)
# @return [self]
def tail_inverse!(*rest, **key)
  remainder = tail_inverse(*rest, **key)
  replace(remainder)
end
725
+
726
# Inverse of tail - returns the content except for the part that {#tail} would return
#
# For a positive number, that is everything but the last num lines (or characters, bytes).
# For a negative number -n, that is the part before the n-th line (character, byte).
#
# @param (see #tail)
# @return [String] the leading part of self; an empty String if {#tail} returns everything.
def tail_inverse(*rest, **key)
  s2 = tail(*rest, **key)
  # Assumes the value returned by tail is a trailing part (suffix) of self.
  # The length is measured in bytes so that unit: :byte works for multibyte characters, too;
  # for a suffix that ends on character boundaries the result is the same as counting characters.
  n_bytes = bytesize - s2.bytesize
  (n_bytes > 0) ? byteslice(0, n_bytes) : ''
end
734
+
735
+
736
+ ##########
737
+ # Instance methods (private)
738
+ ##########
739
+
740
# head command with Regexp
#
# Returns the part of self from the beginning up to the first line that matches re_in.
# If nothing matches, self is returned as it is.
#
# @param re_in [Regexp] Regexp to determine the boundary.
# @param inclusive: [Boolean] If true (Default), the (entire) line that matches re_in is included in the result. Else the entire line is excluded.
# @param linebreak: [String] +\n+ etc (Default: $/).
# @return [String] as self
# @see #head
def head_regexp(re_in, inclusive: true, linebreak: $/)
  mat = re_in.match self
  return self unless mat

  unless inclusive
    # Everything before the line in which the match starts.
    return pre_match_in_line(mat.pre_match, linebreak: linebreak).pre_match
  end

  rest = post_match_in_line(mat, linebreak: linebreak)
  # A nil result means no linebreak follows the match, namely the matched line is
  # the last, unterminated line; then everything after the match belongs to that line.
  rest_of_line = rest ? rest[0] : mat.post_match
  mat.pre_match + mat[0] + rest_of_line
end
private :head_regexp
757
+
758
+
759
# Returns MatchData of the String after the last linebreak in the given pre-match String
#
# @param strpre [String] String of prematch of the last MatchData
# @param linebreak: [String] +\n+ etc (Default: $/)
# @return [MatchData] m[0] is the string after the last linebreak before the matched data (exclusive) and m.pre_match is all the lines before that. m[0] is an empty string if strpre ends with the linebreak.
def pre_match_in_line(strpre, linebreak: $/)
  # The last linebreak is located literally, because linebreak can be any characters,
  # including a multi-character String whose individual characters may appear inside a line.
  ind = strpre.rindex(linebreak)
  start = ind ? ind + linebreak.size : 0
  # \G anchors the match at the given start position, so that
  # m.pre_match is everything up to (and including) the last linebreak.
  /\G.*\z/m.match(strpre, start)
end
private :pre_match_in_line
770
+
771
# Returns MatchData of the String after the MatchData to the linebreak (inclusive)
#
# @param mat [MatchData, String] MatchData, or the matched String (in which case strpost should be given).
# @param strpost [String, nil] Post-match, if mat is String. nil is treated as an empty String.
# @param linebreak: [String] +\n+ etc (Default: $/)
# @return [MatchData] m[0] is the string after matched data and up to the next first linebreak (inclusive) (or empty string if the last character(s) of matched data is the linebreak) and m.post_match is all the lines after that. If no linebreak follows, m[0] is the entire remaining string and m.post_match is empty.
def post_match_in_line(mat, strpost=nil, linebreak: $/)
  if mat.respond_to? :post_match
    # mat is MatchData
    strmatched, strpost = mat[0], mat.post_match
  else
    # raise_typeerror is defined elsewhere in this library.
    strmatched = mat.respond_to?(:to_str) ? mat.to_str : raise_typeerror(mat, 'String')
  end
  strpost = strpost.to_s
  lb_quo = Regexp.quote linebreak
  # The matched string already ends its line: nothing is left in the line.
  return /\A/.match(strpost) if /#{lb_quo}\z/ =~ strmatched
  # non-greedy match and m option are required, as linebreak can be any characters.
  # The alternative \z covers the last line that has no linebreak at its end.
  /.*?(?:#{lb_quo}|\z)/m.match strpost
end
private :post_match_in_line
789
+
790
# tail command with Regexp
#
# The boundary is taken from the last two elements of the Array returned by
# split_with_delimiter (presumably the last matched string and what follows it;
# to be confirmed against PlainText::Split).
#
# @param re_in [Regexp] Regexp to determine the boundary.
# @param inclusive: [Boolean] If true (Default), the (entire) line that matches re_in is included in the result. Else the entire line is excluded.
# @param linebreak: [String] +\n+ etc (Default: $/).
# @return [String] as self
# @see #tail
def tail_regexp(re_in, inclusive: true, linebreak: $/)
  pieces = split_with_delimiter re_in  # PlainText::Split#split_with_delimiter (included in String)
  return self.class.new("") if pieces.size == 0  # Maybe self is a sub-class of String.

  matched = pieces[-2]
  trailing = pieces[-1]

  unless inclusive
    # Everything after the line that contains the matched string.
    return post_match_in_line(matched, trailing, linebreak: linebreak).post_match
  end

  # Note: pieces[0..-3] is [] even if pieces has fewer than 3 elements.
  line_head = pre_match_in_line(pieces[0..-3].join, linebreak: linebreak)[0]
  line_head + matched + trailing
end
private :tail_regexp
809
+
810
+
811
# tail command based on the number of lines
#
# self is split by linebreak and the trailing elements are joined back.
# For a positive num_in, a linebreak at the very end of self does not count as an extra line.
#
# @param num_in [Integer] Original argument of the specified number of lines
# @param num [Integer] Converted integer for num_in
# @param linebreak: [String] +\n+ etc (Default: $/).
# @return [String] as self
# @see #tail
def tail_linenum(num_in, num, linebreak: $/)
  pieces = split(linebreak, -1)  # the limit of -1 keeps the trailing empty element(s).
  return self.class.new("") if pieces.empty?

  ends_with_lb = /#{Regexp.quote linebreak}\z/
  offset = num
  if num_in > 0
    # The empty element after the last linebreak must not consume one of the requested lines.
    offset += 1 if ends_with_lb =~ self
    # More lines are requested than exist: take everything.
    offset = 0 if offset >= pieces.size
  end

  picked = pieces[(-offset)..-1]
  return self.class.new("") if picked.nil? || picked.empty?

  picked.join(linebreak)
end
private :tail_linenum
831
+
832
+
833
+ end # module PlainText
834
+
835
+ require "plain_text/part"
836
+ require "plain_text/parse_rule"
837
+ require "plain_text/split"
838
+ require "plain_text/util"
839
+