plain_text 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +51 -0
- data/ChangeLog +5 -0
- data/Makefile +23 -0
- data/README.en.rdoc +172 -0
- data/Rakefile +9 -0
- data/bin/countchar +89 -0
- data/lib/plain_text/parse_rule.rb +474 -0
- data/lib/plain_text/part/boundary.rb +44 -0
- data/lib/plain_text/part/paragraph.rb +35 -0
- data/lib/plain_text/part.rb +973 -0
- data/lib/plain_text/split.rb +103 -0
- data/lib/plain_text/util.rb +104 -0
- data/lib/plain_text.rb +839 -0
- data/plain_text.gemspec +49 -0
- data/test/test_plain_text.rb +280 -0
- data/test/test_plain_text_parse_rule.rb +146 -0
- data/test/test_plain_text_part.rb +353 -0
- data/test/test_plain_text_split.rb +78 -0
- metadata +72 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 52d4c007bf2d127b5fed9c1edd5df87597ee5ca4689818244ecb05bcb6d0a8f1
|
4
|
+
data.tar.gz: 5af8e4489d714e272c8304911cbfdac18a736a49abf35d5143fd86ddf1d7e917
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 422f9de75686466409ce8d9c819226313974d6d308aa66dbede7b808aed8230b5efb3bbf838151d00031a9157e9109c936e3af3bc2dc51e447bce9fee2bfd81d
|
7
|
+
data.tar.gz: af8ca2904b51fb3ea3b822a49ae3e650ef5112af8f63d4b1df51ade1b31a0d083f712bf2b4f64f5eba17624c11264e6b7ffc8de8ce866541ef8d47a160be299c
|
data/.gitignore
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# See https://help.github.com/articles/ignoring-files for more about ignoring files.
|
2
|
+
#
|
3
|
+
# If you find yourself ignoring temporary files generated by your text editor
|
4
|
+
# or operating system, you probably want to add a global ignore instead:
|
5
|
+
# git config --global core.excludesfile '~/.gitignore_global'
|
6
|
+
|
7
|
+
# Ignore bundler config.
|
8
|
+
/.bundle
|
9
|
+
/vendor/bundle
|
10
|
+
|
11
|
+
# Ignore all logfiles and tempfiles.
|
12
|
+
/log/*
|
13
|
+
/tmp/*
|
14
|
+
!/log/.keep
|
15
|
+
!/tmp/.keep
|
16
|
+
|
17
|
+
.rbenv-version
|
18
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
19
|
+
.rvmrc
|
20
|
+
|
21
|
+
/node_modules
|
22
|
+
/yarn-error.log
|
23
|
+
|
24
|
+
.byebug_history
|
25
|
+
|
26
|
+
*.[oa]
|
27
|
+
*.so
|
28
|
+
*~
|
29
|
+
*.nogem
|
30
|
+
*nogem.*
|
31
|
+
*.bak
|
32
|
+
*.BAK
|
33
|
+
*.backup
|
34
|
+
*.org
|
35
|
+
*.orig
|
36
|
+
*.elc
|
37
|
+
*.pyc
|
38
|
+
\#*\#
|
39
|
+
|
40
|
+
# Elastic Beanstalk Files
|
41
|
+
.elasticbeanstalk/*
|
42
|
+
!.elasticbeanstalk/*.cfg.yml
|
43
|
+
!.elasticbeanstalk/*.global.yml
|
44
|
+
|
45
|
+
# yard
|
46
|
+
*.yardoc
|
47
|
+
|
48
|
+
# Ruby Gem doc
|
49
|
+
*.gem
|
50
|
+
doc/*
|
51
|
+
|
data/ChangeLog
ADDED
data/Makefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
ALL =
|
2
|
+
|
3
|
+
objs =
|
4
|
+
|
5
|
+
.SUFFIXES: .so .o .c .f
|
6
|
+
|
7
|
+
#.o.so:
|
8
|
+
# ${LD} ${LFLAGS} -o $@ $< ${LINK_LIB}
|
9
|
+
|
10
|
+
all: ${ALL}
|
11
|
+
|
12
|
+
|
13
|
+
.PHONY: clean test doc
|
14
|
+
clean:
|
15
|
+
$(RM) bin/*~
|
16
|
+
|
17
|
+
## You may need RUBYLIB=`pwd`/lib:$RUBYLIB
|
18
|
+
test:
|
19
|
+
rake test
|
20
|
+
|
21
|
+
doc:
|
22
|
+
yard doc
|
23
|
+
|
data/README.en.rdoc
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
|
2
|
+
= PlainText - Module and classes to handle plain text
|
3
|
+
|
4
|
+
== Summary
|
5
|
+
|
6
|
+
This module provides utility functions and methods to handle plain
|
7
|
+
text. In the namespace, classes Part/Paragraph/Boundary are defined,
|
8
|
+
which represent the logical structure of a document and another class
|
9
|
+
ParseRule, which describes the rules to parse plain text to produce a Part-type Ruby instance.
|
10
|
+
This package also provides a command-line program to count the number
|
11
|
+
of characters, especially useful for documents in Asian (CJK) characters.
|
12
|
+
|
13
|
+
== Design concept
|
14
|
+
|
15
|
+
=== PlainText - Module and root Namespace
|
16
|
+
|
17
|
+
The original plain text should be in String in Ruby.
|
18
|
+
|
19
|
+
The module {PlainText} offers some useful methods, such as,
|
20
|
+
{PlainText#head} and {PlainText#tail}. They are meant to be included
|
21
|
+
in String. However, it also contains some useful module functions,
|
22
|
+
such as, {PlainText.clean_text} and {PlainText.count_char}.
|
23
|
+
|
24
|
+
=== PlainText::Part - Core class to describe the logical structure
|
25
|
+
|
26
|
+
In the namespace of this module, it contains {PlainText::Part} class,
|
27
|
+
which is the heart to describe the logical structure of documents.
|
28
|
+
It is basically a container class and indeed a sub-class of Array. It
|
29
|
+
can contain either of another {PlainText::Part} or more basic
|
30
|
+
components of either of {PlainText::Part::Paragraph} and
|
31
|
+
{PlainText::Part::Boundary}, both of which are sub-classes of String.
|
32
|
+
|
33
|
+
An example instance looks like this:
|
34
|
+
|
35
|
+
Part (
|
36
|
+
(0) Paragraph::Empty,
|
37
|
+
(1) Boundary::General,
|
38
|
+
(2) Part::ArticleHeader(
|
39
|
+
(0) Paragraph::Title,
|
40
|
+
(1) Boundary::Empty
|
41
|
+
),
|
42
|
+
(3) Boundary::TitleMain,
|
43
|
+
(4) Part::ArticleMain(
|
44
|
+
(0) Part::ArticleSection(
|
45
|
+
(0) Paragraph::Title,
|
46
|
+
(1) Boundary::General,
|
47
|
+
(2) Paragraph::General,
|
48
|
+
(3) Boundary::General,
|
49
|
+
(4) Part::ArticleSubSection(...),
|
50
|
+
(5) Boundary::General,
|
51
|
+
(6) Paragraph::General,
|
52
|
+
(7) Boundary::Empty
|
53
|
+
),
|
54
|
+
(1) Boundary::General,
|
55
|
+
(2) Paragraph::General,
|
56
|
+
(3) Boundary::Empty
|
57
|
+
),
|
58
|
+
(5) Boundary::General
|
59
|
+
)
|
60
|
+
|
61
|
+
where the names of the subclasses (or constants) here are arbitrary, except for
|
62
|
+
{PlainText::Part::Paragraph::Empty} and
|
63
|
+
{PlainText::Part::Boundary::Empty}, which are pre-defined. Users can
|
64
|
+
define their own subclasses to help organize the logical structure at
|
65
|
+
their will.
|
66
|
+
|
67
|
+
Basically, at every layer, every {PlainText::Part} or
|
68
|
+
{PlainText::Part::Paragraph} is sandwiched by
|
69
|
+
{PlainText::Part::Boundary}, except for the very first one.
|
70
|
+
|
71
|
+
By performing +join+ method, one can retrieve the entire document as a
|
72
|
+
String instance any time.
|
73
|
+
|
74
|
+
=== PlainText::ParseRule - Class to describe the rule of how to parse
|
75
|
+
|
76
|
+
{PlainText::ParseRule} is the class to describe how to parse initially
|
77
|
+
String, and subsequently {PlainText::Part}, which is basically an Array.
|
78
|
+
{PlainText::ParseRule} is a container class and holds a set of ordered
|
79
|
+
rules, each of which is either a Proc or, as a simpler rule, a Regexp.
|
80
|
+
A rule, Proc, is defined by a user and is designed to receive either
|
81
|
+
String (the first application only) or {PlainText::Part} (Array)
|
82
|
+
and to return a fully (or partially) parsed {PlainText::Part}.
|
83
|
+
In short, the rule describes how to determine from where to where
|
84
|
+
paragraphs and boundaries are located, and maybe what and where the
|
85
|
+
sections and sub-sections and so on are.
|
86
|
+
|
87
|
+
For example, if a rule is Regexp, it describes how to split a String;
|
88
|
+
it is applied to String in the first application, but if it is
|
89
|
+
applied (and maybe registered as such) at the second or later stage,
|
90
|
+
it is applied to each Paragraph and Section separately to split them further.
|
91
|
+
|
92
|
+
{PlainText::ParseRule#apply} and {PlainText::Part.parse} are the
|
93
|
+
standard methods to apply the rules to an object (either String or
|
94
|
+
{PlainText::Part}).
|
95
|
+
|
96
|
+
== Command-line tool
|
97
|
+
|
98
|
+
=== countchar
|
99
|
+
|
100
|
+
Counts the number of characters in one or more files or STDIN.
|
101
|
+
|
102
|
+
The simplest example to run the command-line script is
|
103
|
+
countchar YourFile.txt
|
104
|
+
|
105
|
+
You may start with
|
106
|
+
countchar --help
|
107
|
+
to see the available options.
|
108
|
+
|
109
|
+
== Miscellaneous
|
110
|
+
|
111
|
+
Module {PlainText::Split} contains an instance method (and class
|
112
|
+
method with the same name) {PlainText::Split#split_with_delimiter},
|
113
|
+
which is included in String by default. The method realises a
|
114
|
+
reversible split of String with a delimiter of an arbitrary Regexp.
|
115
|
+
|
116
|
+
In the standard String#split, the following is the result, when
|
117
|
+
sent by a String instance +s+ = +"XQabXXcXQ"+:
|
118
|
+
|
119
|
+
s.split(/X+Q?/) #=> ["", "ab", "c"],
|
120
|
+
s.split(/X+Q?/, -1) #=> ["", "ab", "c", ""],
|
121
|
+
s.split(/X+(Q?)/, -1) #=> ["", "Q", "ab", "", "c", "Q", ""],
|
122
|
+
s.split(/(X+(Q?))/, -1) #=> ["", "XQ", "Q", "ab", "XX", "", "c", "XQ", "Q", ""],
|
123
|
+
|
124
|
+
With this method,
|
125
|
+
|
126
|
+
s.split_with_delimiter(/X+(Q?)/)
|
127
|
+
#=> ["", "XQ", "ab", "XX", "c", "XQ"]
|
128
|
+
|
129
|
+
from which the original string is always easily recovered by simple +join+.
|
130
|
+
|
131
|
+
Also, {PlainText::Util} contains some miscellaneous methods.
|
132
|
+
|
133
|
+
== Description
|
134
|
+
|
135
|
+
Work in progress...
|
136
|
+
|
137
|
+
It is still in a preliminary state.
|
138
|
+
|
139
|
+
== Install
|
140
|
+
|
141
|
+
This script requires {Ruby}[http://www.ruby-lang.org] Version 2.0
|
142
|
+
or above (possibly 2.2 or above?).
|
143
|
+
|
144
|
+
As for the command-line script file, it can be put in any of your command-line search
|
145
|
+
paths. Make sure the RUBYLIB environment
|
146
|
+
variable contains the library directory to this gem, which is
|
147
|
+
/THIS/GEM/LIBRARY/PATH/plain_text/lib
|
148
|
+
|
149
|
+
You may need to modify the first line (Shebang line) of the script to suit your
|
150
|
+
environment (it should be unnecessary for Linux and MacOS), or run it
|
151
|
+
explicitly with your Ruby command as
|
152
|
+
Prompt% /YOUR/ENV/ruby /YOUR/INSTALLED/countchar
|
153
|
+
|
154
|
+
== Developer's note
|
155
|
+
|
156
|
+
=== Tests
|
157
|
+
|
158
|
+
The Ruby files under the directory <tt>test/</tt> are the test scripts.
|
159
|
+
You can run them from the top directory as <tt>ruby test/test_****.rb</tt>
|
160
|
+
or simply run <tt>make test</tt>.
|
161
|
+
|
162
|
+
|
163
|
+
== Known bugs
|
164
|
+
|
165
|
+
None.
|
166
|
+
|
167
|
+
|
168
|
+
== Copyright
|
169
|
+
|
170
|
+
Author:: Masa Sakano < info a_t wisebabel dot com >
|
171
|
+
Versions:: The versions of this package follow Semantic Versioning (2.0.0) http://semver.org/
|
172
|
+
|
data/Rakefile
ADDED
data/bin/countchar
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
require 'optparse'
|
5
|
+
require 'plain_text'
|
6
|
+
|
7
|
+
# Usage banner handed to OptionParser.new in handle_argv.
# Frozen so that the shared String constant cannot be mutated by accident.
BANNER = <<"__EOF__".freeze
USAGE: #{File.basename($0)} [options] [INFILE.txt] < STDIN
Print the number of characters in INFILE (or STDIN).
__EOF__

# Initialising the hash for the command-line options.
#
# Deliberately NOT frozen: handle_argv writes the parsed option values
# into this hash.
OPTS = {
  preserve_paragraph: true,
  boundary_style: true,
  lbs_style: :t,  # :truncate,
  lb_is_space: false,
  sps_style: :truncate,
  delete_asian_space: true,
  linehead_style: :delete,
  linetail_style: :delete,
  firstsps_style: :delete,
  lastsps_style: :truncate,
  line_i: nil,
  line_f: nil,
  # :chatter => 3,  # Default
  debug: false,
}
|
29
|
+
|
30
|
+
# Function to handle the command-line arguments.
#
# ARGV will be modified (recognised options are removed by
# OptionParser#parse!), and the constant variable OPTS is set.
#
# If the value given to --lbs-style does not start (after stripping
# surrounding white space) with one of "t", "d", or "n", an error message
# is printed to STDERR and the process exits with status 1.
#
# @return [Hash] Optional-argument hash (the updated OPTS itself).
#
def handle_argv
  opt = OptionParser.new(BANNER)
  opt.on('--[no-]preserve_paragraph',
         "Preserved paragraph structures? (Def: #{OPTS[:preserve_paragraph].inspect})",
         TrueClass) { |v| OPTS[:preserve_paragraph] = v }
  # The description is a plain String: no format directive is involved.
  # Only the first non-blank character of STYLE is significant.  The to_s
  # guards against a blank STYLE, whose first character is nil, so that it
  # is rejected by the validation below instead of raising NoMethodError.
  opt.on('--lbs-style=STYLE',
         "One of (t(runcate)|d(elete)|n(one)) (Def: truncate).") { |v| OPTS[:lbs_style] = v.strip[0].to_s.to_sym }
  # opt.on( '--version', "Display the version and exits.", TrueClass) {|v| OPTS[:version] = v}  # Consider opts.on_tail
  opt.on('--[no-]debug', "Debug (Def: false)", TrueClass) { |v| OPTS[:debug] = v }
  opt.separator ""  # Way to control a help message.
  opt.separator "Note:"
  opt.separator " Spaces are truncated in default."

  opt.parse!(ARGV)

  unless %i(t d n).include? OPTS[:lbs_style]
    warn "ERROR: --lbs-style must be one of (t(runcate)|d(elete)|n(one))."
    exit 1
  end

  OPTS
end
|
54
|
+
|
55
|
+
|
56
|
+
################################################
# MAIN
################################################

# Switch both standard streams to synchronous (unbuffered) mode.
[$stdout, $stderr].each { |io| io.sync = true }

# Mix PlainText into String, so that its instance methods (count_char is
# the one used below) can be called directly on the input text.
String.send(:include, PlainText)

# Handle the command-line options => OPTS
options = handle_argv

# Entire input: the files named in what remains of ARGV, or else STDIN.
text = ARGF.read

# Names of the options forwarded to count_char, in the order they are passed.
count_keys = %i(
  preserve_paragraph boundary_style lbs_style lb_is_space sps_style
  delete_asian_space linehead_style linetail_style firstsps_style lastsps_style
)
puts text.count_char(**Hash[count_keys.map { |key| [key, options[key]] }])

exit
|
86
|
+
|
87
|
+
__END__
|
88
|
+
|
89
|
+
|