plain_text 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 52d4c007bf2d127b5fed9c1edd5df87597ee5ca4689818244ecb05bcb6d0a8f1
4
- data.tar.gz: 5af8e4489d714e272c8304911cbfdac18a736a49abf35d5143fd86ddf1d7e917
3
+ metadata.gz: 2c015ed947812371558456375c2933f0d03720082899f8c58c699419eda77f1b
4
+ data.tar.gz: fafc479d9bb492bd3b3ad140ec7a58d2cc0e7bc49dec4ad80c4111ad1f63e3df
5
5
  SHA512:
6
- metadata.gz: 422f9de75686466409ce8d9c819226313974d6d308aa66dbede7b808aed8230b5efb3bbf838151d00031a9157e9109c936e3af3bc2dc51e447bce9fee2bfd81d
7
- data.tar.gz: af8ca2904b51fb3ea3b822a49ae3e650ef5112af8f63d4b1df51ade1b31a0d083f712bf2b4f64f5eba17624c11264e6b7ffc8de8ce866541ef8d47a160be299c
6
+ metadata.gz: cb7d054e24cc85c64bbb556d4de30b3b54c9b51b409519d9b7f307fbe64dc05dc32e6e7cbeccc027b41c842a31ec5b489e60801b1c1c1f72e587157f62f38391
7
+ data.tar.gz: aef2b0ebd0c69f694c438cbf8d8e62d6d754d92c5d804553649c681d6c088bd9bb363197d9fb209b184aa49fb44ef5e733268e1d53a19bc7dfef260c86dee88c
data/ChangeLog CHANGED
@@ -1,3 +1,23 @@
1
+ -----
2
+ (Version: 0.2)
3
+ 2019-10-27 Masa Sakano
4
+ * Plain Text.clean_text
5
+ * Option name and default changed from `firstsps_style=:truncate` to `firstlbs_style=:delete`
6
+ * Default of Option `linehead_style` changed from :delete to :none
7
+ * Option `sps_style` now ignores the line head and tail in a new private class method `clean_text_sps!`
8
+ * Fixed bugs, including the one for Option choice `linetail_style: :markdown`
9
+ * New constant ParseRule::RuleEachLineStrip
10
+
11
+ -----
12
+ (Version: 0.2)
13
+ 2019-10-27 Masa Sakano
14
+ * Plain Text.clean_text
15
+ * Option name and default changed from `firstsps_style=:truncate` to `firstlbs_style=:delete`
16
+ * Default of Option `linehead_style` changed from :delete to :none
17
+ * Option `sps_style` now ignores the line head and tail in a new private class method `clean_text_sps!`
18
+ * Fixed bugs, including the one for Option choice `linetail_style: :markdown`
19
+ * New constant ParseRule::RuleEachLineStrip
20
+
1
21
  -----
2
22
  (Version: 0.1)
3
23
  2019-10-25 Masa Sakano
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012-2018 Scott Chacon and others
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.en.rdoc CHANGED
@@ -169,4 +169,5 @@ None.
169
169
 
170
170
  Author:: Masa Sakano < info a_t wisebabel dot com >
171
171
  Versions:: The versions of this package follow Semantic Versioning (2.0.0) http://semver.org/
172
+ License:: MIT
172
173
 
@@ -81,6 +81,10 @@ module PlainText
81
81
  # pt1.parts[0].parts[1] # => Paragraph::Title("Breaking!")
82
82
  # pt1.boundaries[1] # => Boundary("\n======\n")
83
83
  #
84
+ # @todo
85
+ # It would be smarter each instance (Regexp and Part) has its own "name"
86
+ # rather than this class holds @names as an Array.
87
+ #
84
88
  # @author Masa Sakano (Wise Babel Ltd)
85
89
  #
86
90
  class ParseRule
@@ -462,13 +466,23 @@ module PlainText
462
466
 
463
467
  def_lb_q = PlainText::DefLineBreaks.map{|i| Regexp.quote i}.join '|'
464
468
 
465
- # {ParseRule} instance to
469
+ # {ParseRule} instance to
466
470
  # split a String with 2 or more linebreaks (with potentially white-spaces in between).
467
471
  # This instance can be dup-ped and used normally. However, if it is clone-d, the cloned instance would be unmodifiable.
468
472
  RuleConsecutiveLbs = self.new(/((?:#{def_lb_q})(?:#{def_lb_q}|[[:blank:]])*(?:#{def_lb_q}))/, name: 'ConsecutiveLbs') # => /((?:\r\n|\n|\r){2,}/
469
473
  RuleConsecutiveLbs.freeze
470
474
  RuleConsecutiveLbs.rules.freeze
471
475
  RuleConsecutiveLbs.names.freeze
476
+
477
+ # {ParseRule} instance to
478
+ # split a String with 1 linebreak that is potentially sandwiched with white-spaces
479
+ # (or a whitespace(s) at the very beginning or end).
480
+ # Essentially, each line (after Ruby-strip-ped) is treated as Paragraph.
481
+ # This instance can be dup-ped and used normally. However, if it is clone-d, the cloned instance would be unmodifiable.
482
+ RuleEachLineStrip = self.new(/(\A[[:space:]]+|[[:space:]]*\n[[:space:]]*|[[:space:]]+\z)/, name: 'EachLineStrip') # => /((?:\r\n|\n|\r){2,}/
483
+ RuleEachLineStrip.freeze
484
+ RuleEachLineStrip.rules.freeze
485
+ RuleEachLineStrip.names.freeze
472
486
  end # class ParseRule
473
487
  end # module PlainText
474
488
 
data/lib/plain_text.rb CHANGED
@@ -55,6 +55,7 @@ module PlainText
55
55
  # @return [Integer]
56
56
  def self.count_char(instr, *rest,
57
57
  lbs_style: :delete,
58
+ linehead_style: :delete,
58
59
  lastsps_style: :delete,
59
60
  lb_out: "\n",
60
61
  **k)
@@ -72,9 +73,9 @@ module PlainText
72
73
  # * Blank lines are truncated into one line with no white spaces: +boundary_style=lb_out*2(=$/*2)+
73
74
  # * Consecutive white spaces are truncated into a single space: +sps_style=:truncate+
74
75
  # * White spaces before or after a CJK character is deleted: +delete_asian_space=true+
75
- # * Preceding white spaces in each line are deleted: +linehead_style=:delete+
76
+ # * Preceding white spaces in each line are preserved: +linehead_style=:none+
76
77
  # * Trailing white spaces in each line are deleted: +linetail_style=:delete+
77
- # * Preceding line-breaks and white spaces at the beginning of the entire input string are truncated into one space: +firstsps_style=:truncate+
78
+ # * Line-breaks at the beginning of the entire input string are deleted: +firstlbs_style=:delete+
78
79
  # * Trailing white spaces and line-breaks at the end of the entire input string are truncated into a single linebreak: +lastsps_style=:truncate+
79
80
  #
80
81
  # For a String with predominantly CJK characters, the following setting is recommended:
@@ -85,19 +86,25 @@ module PlainText
85
86
  # Note for the Symbols in optional arguments, the Symbol with the first character only is accepted,
86
87
  # e.g., +:d+ instead of +:delete+ (nb., +:t2+ for +:truncate2+).
87
88
  #
88
- # For more detail, see the description.
89
+ # For more detail, see the description of each command-line option.
90
+ #
91
+ # Note that for the case of traditional genko-yoshi-style Japanese texts
92
+ # with "jisage" for each new paragraph marking a new paragraph, probably
93
+ # the best way is to make your own Part instance to give to this method,
94
+ # where the rule for the Part should be something like:
95
+ # /(\A[[:blank:]]+|\n[[:space:]]+)/
89
96
  #
90
97
  # @param prt [PlainText:Part, String] {Part} or String to examine.
91
98
  # @param preserve_paragraph: [Boolean] Paragraphs are taken into account if true (Def: False). In the input, paragraphs are defined to be separated with more than one +lb+ with potentially some space characters in between. Their output style is specified with +boundary_style+.
92
99
  # @param boundary_style: [String, Symbol] One of +(:truncate|:truncate2|:delete|:none)+ or String. If String, the boundaries between paragraphs are replaced with this String (Def: +lb_out*2+). If +:truncate+, consecutive linebreaks and spaces are truncated into 2 linebreaks. +:truncate2+ are similar, but they are not truncated beyond 3 linebreaks (ie., up to 2 blank lines between Paragraphs). If +:none+, nothing is done about them. Unless :none, all the white spaces between linebreaks are deleted.
93
100
  # @param lbs_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: +:truncate+). If :delete, all the linebreaks within paragraphs are deleted. +:truncate+ is meaningful only when +preserve_paragraph=false+ and consecutive linebreaks are truncated into 1 linebreak.
94
- # @param sps_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: +:truncate+). If +:truncate+, the consecutive white spaces within paragraphs are truncated into a single white space. If :delete, they are deleted.
101
+ # @param sps_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: +:truncate+). If +:truncate+, the consecutive white spaces within paragraphs, *except* for those at the line-head or line-tail (which are controlled by +linehead_style+ and +linetail_style+, respectively), are truncated into a single white space. If :delete, they are deleted.
95
102
  # @param lb_is_space: [Boolean] If true, a line-break, except those for the boundaries (unless +preserve_paragraph+ is false), is equivalent to a space (Def: False).
96
103
  # @param delete_asian_space: [Boolean] Any spaces between, before, after Asian characters (but punctuation) are deleted, if true (Default).
97
- # @param linehead_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: :delete). Determine how to handle consecutive white spaces at the beggining of each line.
98
- # @param linetail_style: [Symbol] One of +(:truncate|:delete|:markdown|:none)+ (Def: :delete). Determine how to handle consecutive white spaces at the end of each line. If +:markdown:, two spaces at the end are preserved, whereas one or more than 2 consecutive spaces are deleted.
99
- # @param firstsps_style: [Symbol, String] One of +(:truncate|:delete|:none)+ or String (Def: :default). If +:truncate+, any of white spaces and linebreaks at the very beginning of self, if exist, are truncated to a single white space (different from +lastsps_style+). If String, they are, even if not exists, replaced with the specified String (such as a linebreak). If +:delete+, they are deleted.
100
- # @param lastsps_style: [Symbol, String] One of +(:truncate|:delete|:none|:linebreak)+ or String (Def: :truncate). If +:truncate+, any of white spaces and linebreaks at the very beginning of self, if exist, are truncated to a single white space (different from +firstsps_style+). If +:delete+, they are deleted. If String, they are, even if not exists, replaced with the specified String (such as a linebreak). If +:linebreak+, +lb_out+ is used as String (i.e., only 1 linebreak always exists).
104
+ # @param linehead_style: [Symbol] One of +(:truncate|:delete|:none)+ (Def: :none). Determines how to handle consecutive white spaces at the beginning of each line.
105
+ # @param linetail_style: [Symbol] One of +(:truncate|:delete|:markdown|:none)+ (Def: :delete). Determines how to handle consecutive white spaces at the end of each line. If +:markdown+, 1 space is always deleted, and two or more spaces are truncated into two ASCII whitespaces *if* the last two spaces are ASCII whitespaces, or else untouched.
106
+ # @param firstlbs_style: [Symbol, String] One of +(:truncate|:delete|:none)+ or String (Def: :delete). If +:truncate+, any linebreaks at the very beginning of self (and whitespaces in between), if they exist, are truncated to a single linebreak. If String, they are replaced with the specified String (such as a linebreak), which is inserted even if none exist. If +:delete+, they are deleted. Note this option has nothing to do with the whitespaces at the beginning of the first significant line (hence the name of the option). Note if a (random) Part is given, this option only considers the first significant element of it.
107
+ # @param lastsps_style: [Symbol, String] One of +(:truncate|:delete|:none|:linebreak)+ or String (Def: :truncate). If +:truncate+, any of linebreaks *AND* white spaces at the tail of self, if exist, are truncated to a single linebreak. If +:delete+, they are deleted. If String, they are, even if not exists, replaced with the specified String (such as a linebreak, in which case +lb_out+ is used as String, i.e., it guarantees only 1 linebreak to exist at the end of the String). Note if a (random) Part is given, this option only considers the last significant element of it.
101
108
  # @param lb: [String] Linebreak character like +\n+ etc (Default: $/). If this is one of the standard line-breaks, irregular line-breaks (for example, existence of CR when only LF should be there) are corrected.
102
109
  # @param lb_out: [String] Linebreak used for output (Default: +lb+)
103
110
  # @return same as prt
@@ -110,16 +117,16 @@ module PlainText
110
117
  lb_is_space: false,
111
118
  sps_style: :truncate,
112
119
  delete_asian_space: true,
113
- linehead_style: :delete,
120
+ linehead_style: :none,
114
121
  linetail_style: :delete,
115
- firstsps_style: :delete,
122
+ firstlbs_style: :delete,
116
123
  lastsps_style: :truncate,
117
124
  lb: $/,
118
125
  lb_out: nil, # If unspecified, will be replaced with lb
119
126
  is_debug: false
120
127
  )
121
128
 
122
- #isdebug = true if prt == "\n ab\n \ncd\n \n \n ef\n \n \n \n gh\n \n \n \n" #DEBUG
129
+ isdebug = true if prt == "\n \n abc\n\n \ndef\n\n \n\n"
123
130
  lb_out ||= lb # Output linebreak
124
131
  boundary_style = lb_out*2 if true == boundary_style
125
132
  boundary_style = "" if [:delete, :d].include? boundary_style
@@ -128,7 +135,12 @@ module PlainText
128
135
  if !prt.class.method_defined? :last_significant_element
129
136
  # Construct a Part instance from the given String.
130
137
  ret = ''
131
- prt = prt.unicode_normalize
138
+ begin
139
+ prt = prt.unicode_normalize
140
+ rescue ArgumentError # (invalid byte sequence in UTF-8)
141
+ warn "The given String in (#{self.name}\##{__method__}) seems wrong."
142
+ raise
143
+ end
132
144
  prt = normalize_lb(prt, "\n", lb_from: (DefLineBreaks.include?(lb) ? nil : lb)).dup
133
145
  kwd = (["\r\n", "\r", "\n"].include?(lb) ? {} : { rules: /#{Regexp.quote lb}{2,}/})
134
146
  prt = (preserve_paragraph ? Part.parse(prt, **kwd) : Part.new([prt]))
@@ -148,6 +160,7 @@ module PlainText
148
160
  lb_is_space: lb_is_space,
149
161
  sps_style: sps_style,
150
162
  delete_asian_space: delete_asian_space,
163
+ is_debug: is_debug
151
164
  )
152
165
  # Handles the line head/tails.
153
166
  clean_text_line_head_tail!( prt,
@@ -157,8 +170,9 @@ module PlainText
157
170
 
158
171
  # Handles the file head/tail.
159
172
  clean_text_file_head_tail!( prt,
160
- firstsps_style: firstsps_style,
173
+ firstlbs_style: firstlbs_style,
161
174
  lastsps_style: lastsps_style,
175
+ is_debug: isdebug
162
176
  )
163
177
 
164
178
  # Replaces the linebreaks to the specified one
@@ -284,29 +298,29 @@ module PlainText
284
298
  )
285
299
 
286
300
  # Linebreaks and spaces
287
- [[lbs_style, "\n", "\n"], [sps_style, '[[:blank:]]', " "]].each do |ea|
288
- # FROM TO FROM TO
289
- case ea[0]
290
- when :truncate, :t
291
- prt.parts.each{|ec| ec.gsub!(/#{ea[1]}{2,}/m, ea[2])}
292
- when :delete, :d
293
- prt.parts.each{|ec| ec.gsub!(/#{ea[1]}/m, "")}
294
- when :none, :n
295
- else
296
- raise ArgumentError
297
- end
301
+ case lbs_style
302
+ when :truncate, :t
303
+ prt.parts.each{|ec| ec.gsub!(/\n{2,}/m, "\n")}
304
+ when :delete, :d
305
+ prt.parts.each{|ec| ec.gsub!(/\n/m, "")}
306
+ when :none, :n
307
+ # Does nothing
308
+ else
309
+ raise ArgumentError
298
310
  end
299
311
 
312
+ # Handles spaces in each line
313
+ clean_text_sps!(prt, sps_style: sps_style, is_debug: is_debug)
314
+
300
315
  # Linebreaks become spaces
301
316
  if lb_is_space
302
317
  prt.parts.each{|ec| ec.gsub!(/\n/m, " ")}
303
- prt.parts.each{|ec| ec.gsub!(/\n{2,}/m, "\n")} if lbs_style == :truncate
318
+ clean_text_sps!(prt, sps_style: sps_style, is_debug: is_debug) if sps_style == :truncate
304
319
  end
305
320
 
306
321
  # Ignore spaces between, before, and after Asian characters.
307
322
  if delete_asian_space
308
- # prt.map_parts do |ea_p|
309
- prt.parts.each do |ea_p|
323
+ prt.parts.each do |ea_p|
310
324
  PlainText.extend_this(ea_p)
311
325
  ea_p.delete_spaces_bw_cjk_european! # Destructive change in prt.
312
326
  end
@@ -322,7 +336,7 @@ module PlainText
322
336
  # @see Plaintext.clean_text
323
337
  def self.clean_text_line_head_tail!(
324
338
  prt,
325
- linehead_style: :delete,
339
+ linehead_style: :none,
326
340
  linetail_style: :delete,
327
341
  is_debug: false
328
342
  )
@@ -348,7 +362,7 @@ module PlainText
348
362
  when :markdown, :m
349
363
  # Two spaces are preserved
350
364
  prt.parts.each{|ec| ec.gsub!(/(?:^|(?<![[:blank:]]))[[:blank:]]$/, "")} # A single space is deleted.
351
- prt.parts.each{|ec| ec.gsub!(/[[:blank:]]* $/, " ")} # 3 or more spaces are truncated into 2 spaces, only IF the last two spaces are the ASCII spaces.
365
+ prt.parts.each{|ec| ec.gsub!(/[[:blank:]]+ $/, " ")} # 3 or more spaces are truncated into 2 spaces, only IF the last two spaces are the ASCII spaces.
352
366
  when :none, :n
353
367
  # Do nothing
354
368
  else
@@ -358,36 +372,47 @@ module PlainText
358
372
  private_class_method :clean_text_line_head_tail!
359
373
 
360
374
  # @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
361
- # @param firstsps_style [Symbol, String] (see Plaintext.clean_text#firstsps_style)
375
+ # @param firstlbs_style [Symbol, String] (see Plaintext.clean_text#firstlbs_style)
362
376
  # @param lastsps_style [Symbol, String] (see Plaintext.clean_text#lastsps_style)
363
377
  # @return [void]
364
378
  #
365
379
  # @see Plaintext.clean_text
366
380
  def self.clean_text_file_head_tail!(
367
381
  prt,
368
- firstsps_style: :delete,
382
+ firstlbs_style: :delete,
369
383
  lastsps_style: :truncate,
370
384
  is_debug: false
371
385
  )
372
386
 
373
387
  # Handles the beginning of the given Part.
374
- obj = prt.first_significant_element
375
- # The first significant element is either Paragraph or Background. Either way,
376
- # the beginning of the next element would not have any [[:space:]].
388
+ obj = prt.first_significant_element || return
389
+ # The first significant element is either Paragraph or Background.
390
+ # obj may be nil.
377
391
 
378
- case firstsps_style
392
+ case firstlbs_style
379
393
  when String
380
- obj.sub!(/\A[[:space:]]*/m, firstsps_style)
394
+ # This assumes the first Background is not
395
+ # (1) containing any non-space characters,
396
+ # (2) white-spaces only AND the first Paragraph starts from a linebreak.
397
+ # You can assume it as long as String is the original input.
398
+ # However, if the input is Part, anything can be possible, like
399
+ # first multiple Backgrounds contain a linebreak for each, each of which
400
+ # follows an empty Paragraph...
401
+ # The thing is, if String is always returned, it is much easier
402
+ # to process after Part#join. However, the method may return Part.
403
+ # Therefore, you cannot do it!
404
+ # I explain it in the document in {self.clean_text}.
405
+ obj.sub!(/\A([[:space:]]*\n)?/m, firstlbs_style)
381
406
  when :truncate, :t
382
- # The initial blank lines, if exist, are truncated to a single " "
383
- obj.sub!(/\A[[:space:]]+/m, " ")
407
+ # The initial blank lines, if exist, are truncated to a single "\n"
408
+ obj.sub!(/\A[[:space:]]*\n/m, "\n")
384
409
  when :delete, :d
385
- # The initial blank lines and white spaces are deleted.
410
+ # The initial blank lines are deleted.
386
411
  obj.sub!(/\A[[:space:]]*\n/m, "")
387
412
  when :none, :n
388
413
  # Do nothing
389
414
  else
390
- raise ArgumentError, "Invalid firstsps_style (#{firstsps_style.inspect}) is specified."
415
+ raise ArgumentError, "Invalid firstlbs_style (#{firstlbs_style.inspect}) is specified."
391
416
  end
392
417
 
393
418
  # Handles the end of the given Part.
@@ -423,6 +448,43 @@ module PlainText
423
448
  private_class_method :clean_text_file_head_tail!
424
449
 
425
450
 
451
+ # Handles spaces within Paragraphs
452
+ #
453
+ # uses Part to transform a Paragraph into a Part
454
+ #
455
+ # @param prt [PlainText:Part] (see Plaintext.clean_text#prt)
456
+ # @param sps_style (see Plaintext.clean_text#sps_style)
457
+ # @return [void]
458
+ #
459
+ # @see Plaintext.clean_text
460
+ def self.clean_text_sps!(
461
+ prt,
462
+ sps_style: :truncate,
463
+ is_debug: false
464
+ )
465
+
466
+ prt.parts.each do |e_pa|
467
+ ru = ParseRule
468
+ # Each line treated as a Paragraph, and [[:space:]]+ between them as a Boundary.
469
+ # Then, to work on anything within a line except for line-head/tail is easy.
470
+ prt_para = Part.parse(e_pa, rule: ParseRule::RuleEachLineStrip).map_parts { |e_li|
471
+ case sps_style
472
+ when :truncate, :t
473
+ e_li.gsub(/[[:blank:]]{2,}/m, " ")
474
+ when :delete, :d
475
+ e_li.gsub(/[[:blank:]]+/m, "")
476
+ when :none, :n
477
+ e_li
478
+ else
479
+ raise ArgumentError
480
+ end
481
+ } # map_parts
482
+ e_pa.replace prt_para.join
483
+ end
484
+ end
485
+ private_class_method :clean_text_sps!
486
+
487
+
426
488
  ####################################################
427
489
  # Instance methods
428
490
  ####################################################
@@ -438,6 +500,7 @@ module PlainText
438
500
  # @return [Integer]
439
501
  def count_char(*rest,
440
502
  lbs_style: :delete,
503
+ linehead_style: :delete,
441
504
  lastsps_style: :none,
442
505
  lb_out: "\n",
443
506
  **k)
data/plain_text.gemspec CHANGED
@@ -1,17 +1,19 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  require 'rake'
4
+ require 'date'
4
5
 
5
6
  Gem::Specification.new do |s|
6
- s.name = %q{plain_text}
7
- s.version = "0.1"
7
+ s.name = %q{plain_text}.sub(/.*/){|c| (c == File.basename(Dir.pwd)) ? c : raise("ERROR: s.name=(#{c}) in gemspec seems wrong!")}
8
+ s.version = "0.2"
8
9
  # s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
9
10
  %w(countchar).each do |f|
10
- s.executables << f
11
+ path = s.bindir+'/'+f
12
+ File.executable?(path) ? s.executables << f : raise("ERROR: Executable (#{path}) is not executable!")
11
13
  end
12
14
  s.bindir = 'bin'
13
15
  s.authors = ["Masa Sakano"]
14
- s.date = %q{2019-10-25}
16
+ s.date = %q{2019-10-27}.sub(/.*/){|c| (Date.parse(c) == Date.today) ? c : raise("ERROR: s.date=(#{c}) is not today!")}
15
17
  s.summary = %q{Module to handle Plain-Text}
16
18
  s.description = %q{This module provides utility functions and methods to handle plain text, classes Part/Paragraph/Boundary to represent the logical structure of a document and ParseRule to describe the rules to parse plain text to produce a Part-type Ruby instance.}
17
19
  # s.email = %q{abc@example.com}
@@ -77,9 +77,9 @@ class TestUnitPlainText < MiniTest::Test
77
77
  def test_clean_text03
78
78
  assert_raises(ArgumentError){ PT.clean_text("abc", boundary_style: nil) }
79
79
  s1 = "abc \n \n def\n\n"
80
- s20 = "abc\n \ndef\n"
81
- s21 = "abcXYZdefXYZ"
82
- s22 = "abc\n\ndef\n"
80
+ s20 = "abc\n \n def\n"
81
+ s21 = "abcXYZ defXYZ"
82
+ s22 = "abc\n\n def\n"
83
83
  sr = PT.clean_text(s1, boundary_style: :none)
84
84
  assert_equal s20, sr, prerr(s20, sr)
85
85
  sr = PT.clean_text(s1, boundary_style: "XYZ")
@@ -105,36 +105,46 @@ class TestUnitPlainText < MiniTest::Test
105
105
  assert_equal s22, sr, prerr(s22, sr)
106
106
 
107
107
  s3 = "\nabc\n\ndef"
108
- s41 = " abc\n\ndefTT"
108
+ s41 = "\nabc\n\ndefTT"
109
109
  s42 = "\nabc\n\ndef"
110
- sr = PT.clean_text(s3, firstsps_style: :truncate, lastsps_style: 'TT')
110
+ sr = PT.clean_text(s3, firstlbs_style: :truncate, lastsps_style: 'TT')
111
111
  assert_equal s41, sr, prerr(s41, sr)
112
- sr = PT.clean_text(s3, firstsps_style: :none, lastsps_style: :delete)
112
+ sr = PT.clean_text(s3, firstlbs_style: :none, lastsps_style: :delete)
113
113
  assert_equal s42, sr, prerr(s42, sr)
114
114
  end
115
115
 
116
116
  def test_clean_text_boundary01
117
- assert_raises(ArgumentError){ PT.clean_text("abc", boundary_style: nil) }
118
117
  s1 = "\n ab\n \ncd\n \n \n ef\n \n \n \n gh\n \n \n \n"
119
- s21 = " ab\n \ncd\n \n \n ef\n \n \n \n gh\n"
118
+ s21 = "\n ab\n \ncd\n \n \n ef\n \n \n \n gh\n"
120
119
  s22 = "\n ab\n\ncd\n\n ef\n\n gh\n\n"
121
- s23 = "\n ab\n\ncd\n\n\n ef\n\n\n gh\n\n\n"
122
- sr = PT.clean_text(s1, boundary_style: :n, lastsps_style: :t, linehead_style: :n, firstsps_style: :t, sps_style: :n)
120
+ s23 = "\n ab\n\ncd\n\n\n ef\n\n\n gh\n\n\n"
121
+ sr = PT.clean_text(s1, boundary_style: :n, lastsps_style: :t, linehead_style: :n, firstlbs_style: :t, sps_style: :n)
123
122
  assert_equal s21, sr, prerr(s21, sr)
124
- sr = PT.clean_text(s1, boundary_style: :t, lastsps_style: :n, linehead_style: :n, firstsps_style: :n, sps_style: :n)
123
+ sr = PT.clean_text(s1, boundary_style: :t, lastsps_style: :n, linehead_style: :n, firstlbs_style: :n, sps_style: :n)
125
124
  assert_equal s22, sr, prerr(s22, sr)
126
- sr = PT.clean_text(s1, boundary_style: :t2, lastsps_style: :n, linehead_style: :t, firstsps_style: :n, sps_style: :n)
125
+ sr = PT.clean_text(s1, boundary_style: :t2, lastsps_style: :n, linehead_style: :t, firstlbs_style: :n, sps_style: :n)
127
126
  assert_equal s23, sr, prerr(s23, sr)
128
127
  end
129
128
 
129
+ def test_clean_text_markdown01
130
+ s0 = "\n ab \n \n cd \n \n\n ef \n \ngh \t \n\nij \t \n\nkl \u3000 \n\nmn"
131
+ s21 = "\n ab\n\n cd \n\n ef \n\ngh \t \n\nij \n\nkl \u3000 \n\nmn"
132
+ s22 = " ab\n\n cd \n\n ef \n\ngh \t \n\nij \n\nkl \u3000 \n\nmn"
133
+
134
+ sr = PT.clean_text(s0, linehead_style: :n, linetail_style: :m, firstlbs_style: :none)
135
+ assert_equal s21, sr, prerr(s21, sr)
136
+ sr = PT.clean_text(s0, linehead_style: :n, linetail_style: :m, firstlbs_style: :delete)
137
+ assert_equal s22, sr, prerr(s22, sr)
138
+ end
139
+
130
140
  def test_clean_text_part01
131
141
  s0 = "\n \n abc\n\n \ndef\n\n \n\n"
132
- s1 = "TTabc\n\ndef\n"
142
+ s1 = "TT abc\n\ndef\n"
133
143
  p00 = PT::Part.parse s0
134
144
  p0 = PT::Part.parse s0
135
- sr = PT.clean_text(s0, firstsps_style: 'TT')
145
+ sr = PT.clean_text(s0, firstlbs_style: 'TT')
136
146
  assert_equal s1, sr, prerr(s1, sr)
137
- sr = PT.clean_text(p0, firstsps_style: 'TT')
147
+ sr = PT.clean_text(p0, firstlbs_style: 'TT')
138
148
  assert_equal PT::Part, sr.class
139
149
  assert_equal s1, sr.join
140
150
  assert_equal p00, p0, prerr(p00, p0) # p0 is deepcopied?
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: plain_text
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Masa Sakano
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-10-25 00:00:00.000000000 Z
11
+ date: 2019-10-27 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: This module provides utility functions and methods to handle plain text,
14
14
  classes Part/Paragraph/Boundary to represent the logical structure of a document
@@ -23,6 +23,7 @@ extra_rdoc_files:
23
23
  files:
24
24
  - ".gitignore"
25
25
  - ChangeLog
26
+ - LICENSE.txt
26
27
  - Makefile
27
28
  - README.en.rdoc
28
29
  - Rakefile
@@ -60,8 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
60
61
  - !ruby/object:Gem::Version
61
62
  version: '0'
62
63
  requirements: []
63
- rubyforge_project:
64
- rubygems_version: 2.7.3
64
+ rubygems_version: 3.0.3
65
65
  signing_key:
66
66
  specification_version: 4
67
67
  summary: Module to handle Plain-Text