structuredtext 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,90 @@
1
+ = Structured Text Utilities
2
+
3
+ This module provides utilities for working with structured text.
4
+
5
+ This includes comment handling and delimited field (aka CSV) parsing with
6
+ support for quoted strings.
7
+
8
+ === Commented Text
9
+
10
+ The StructuredText::CommentedReader class removes comments from text.
11
+
12
+ Comments start with a specified comment delimiter (the default is "#") and
13
+ continue to the end of the line. For example, the following text:
14
+
15
+ line 1 # comment 1
16
+ # comment 2
17
+ line 3 # comment 3
18
+
19
+ becomes:
20
+
21
+ line 1
22
+ line 3
23
+
24
+ The default comment delimiter is "#". A different delimiter may be specified
25
+ when the object is created. Blank lines may either be returned or ignored.
26
+
27
+ === Delimited Text
28
+
29
+ The StructuredText::DelimitedReader class parses field-delimited text
30
+ yielding records.
31
+
32
+ In field-delimited text, each line is a record that consists of a series of
33
+ fields delimited by a specified character. When that character is a comma
34
+ these are called comma-separated-value (CSV) files.
35
+
36
+ An array of fields is yielded for each line of field-delimited text. For
37
+ example, the following text:
38
+
39
+ apples, red, round
40
+ bananas, yellow, oblong
41
+
42
+ is parsed into these arrays:
43
+
44
+ ['apples', ' red', ' round']
45
+ ['bananas', ' yellow', ' oblong']
46
+
47
+ The field text may contain quoted strings. Delimiter characters inside
48
+ quotes are not treated as field delimiters. So:
49
+
50
+ apples,"red,green",round
51
+ bananas,yellow,oblong
52
+
53
+ becomes:
54
+
55
+ ['apples', '"red,green"', 'round']
56
+ ['bananas', 'yellow', 'oblong']
57
+
58
+ Note here that the second field of the first line contains the text
59
+ "red,green".
60
+
61
+ The caller may specify custom field delimiter and right- and left-hand quote
62
+ characters.
63
+
64
+ The StructuredText::LabeledDelimitedReader class extends this functionality
65
+ by treating the first line of the text as a header row that contains field
66
+ names. A hash with the field values assigned to their corresponding header
67
+ names is yielded for each line of input. For example, the following text:
68
+
69
+ Fruit,Color,Shape
70
+ apples,red,round
71
+ bananas,yellow,oblong
72
+
73
+ is parsed into these hashes:
74
+
75
+ {"Shape"=>"round", "Fruit"=>"apples", "Color"=>"red"}
76
+ {"Shape"=>"oblong", "Fruit"=>"bananas", "Color"=>"yellow"}
77
+
78
+ = History
79
+
80
+ 1.0.0:: Comment handling and field-delimited text
81
+
82
+ = Copyright
83
+
84
+ Copyright 2009, William Patrick McNeill
85
+
86
+ This program is distributed under the GNU General Public License.
87
+
88
+ = Author
89
+
90
+ W.P. McNeill mailto:billmcn@gmail.com
@@ -0,0 +1,206 @@
1
+ # Copyright 2009 William Patrick McNeill
2
+ #
3
+ # This file is part of StructuredText.
4
+ #
5
+ # StructuredText is free software; you can redistribute it and/or modify it
6
+ # under the terms of the GNU General Public License as published by the Free
7
+ # Software Foundation; either version 2 of the License, or (at your option)
8
+ # any later version.
9
+ #
10
+ # StructuredText is distributed in the hope that it will be useful, but
11
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13
+ # more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License along with
16
+ # StructuredText; if not, write to the Free Software Foundation, Inc., 51 Franklin
17
+ # St, Fifth Floor, Boston, MA 02110-1301 USA
18
+ #
19
+
20
+
21
+ # Utilities for working with various kinds of structured text.
22
+ module StructuredText
23
+ VERSION = "1.0.0"
24
+
25
+
26
+ # Removes comments from text.
27
+ #
28
+ # Comments start with a specified comment delimiter and continue to the end
29
+ # of the line.
30
+ #
31
+ # > StructuredText::CommentedReader.new(<<-EOTEXT
32
+ # " line 1 # comment 1
33
+ # " # comment 2
34
+ # " line 3 # comment 3
35
+ # " EOTEXT
36
+ # > ).collect
37
+ # => ["line 1 ", "line 3 "]
38
+ #
39
+ # The default comment delimiter is "#". A different delimiter may be
40
+ # specified when the object is created. Blank lines may either be returned
41
+ # or ignored.
42
+ class CommentedReader
43
+ include Enumerable
44
+
45
+ # Initialize the reader with the text to parse and parameters that
46
+ # determine how comments will be processed.
47
+ #
48
+ # [_source_] an enumerable set of text lines, e.g. a stream or a string.
49
+ # [<em>comment_delimiter</em>] the comment delimiter, the default is
50
+ # <em>"#"</em>
51
+ # [<em>skip_blanks</em>] skip blank lines in the input, the default is
52
+ # _true_
53
+ def initialize(source, comment_delimiter = "#", skip_blanks = true)
54
+ @source = source
55
+ @comment_regex = Regexp.compile(comment_delimiter + '.*$')
56
+ @skip_blanks = skip_blanks
57
+ end
58
+
59
+ # Enumerate the lines in the source, removing all text after a comment
60
+ # character.
61
+ def each # :yields: line with comments removed
62
+ @source.each do |line|
63
+ line.chomp!
64
+ line.sub!(@comment_regex, "")
65
+ yield line if not @skip_blanks or not line.empty?
66
+ end
67
+ end
68
+ end # CommentedReader
69
+
70
+
71
+ # Parses field-delimited text yielding records.
72
+ #
73
+ # In field-delimited text, each line is a record that consists of a series
74
+ # of fields delimited by a specified character. When that character is a
75
+ # comma these are called comma-separated-value (CSV) files.
76
+ #
77
+ # This class enumerates field-delimited text yielding an array of fields for
78
+ # each line of input.
79
+ #
80
+ # > StructuredText::DelimitedReader.new(<<-EOTEXT
81
+ # " apples, red, round
82
+ # " bananas, yellow, oblong
83
+ # " EOTEXT
84
+ # > ).collect
85
+ # => [["apples", " red", " round"], ["bananas", " yellow", " oblong"]]
86
+ #
87
+ # The field text may contain quoted strings. Delimiter characters inside
88
+ # quotes are not treated as field delimiters.
89
+ #
90
+ # > StructuredText::DelimitedReader.new(<<-EOTEXT
91
+ # " apples,"red,green",round
92
+ # " bananas,yellow,oblong
93
+ # " EOTEXT
94
+ # > ).collect
95
+ # => [["apples", "\"red,green\"", "round"], ["bananas", "yellow", "oblong"]]
96
+ #
97
+ # Note here that the second field of the first line contains the text
98
+ # "red,green".
99
+ #
100
+ # The caller may specify custom field delimiter and right- and left-hand
101
+ # quote characters.
102
+ class DelimitedReader
103
+ include Enumerable
104
+
105
+ # Initialize the reader with the text and optional characters that control
106
+ # the parsing format.
107
+ #
108
+ # By default, the field delimiter is a comma (,) and the quote character
109
+ # is a double-quote ("). Both of these defaults can be overridden with
110
+ # arguments passed to this function. The caller may also specify different
111
+ # left-hand and right-hand quote characters, e.g. ( and ).
112
+ #
113
+ # [_source_] an enumerable set of text lines, e.g. a stream or a string.
114
+ # [_delimiter_] the field delimiter character
115
+ # [_lquote_] the left-hand field quote character
116
+ # [_rquote_] the right-hand field quote character; if unspecified, it is
117
+ # identical to the left-hand field quote
118
+ def initialize(source, delimiter = ",", lquote = '"', rquote = nil)
119
+ @source = source
120
+ # Escape the custom characters in case the caller provides a regular
121
+ # expression control character.
122
+ delimiter = Regexp.escape(delimiter)
123
+ lquote = Regexp.escape(lquote)
124
+ rquote = rquote.nil? ? lquote : Regexp.escape(rquote)
125
+ s = <<-EOTEXT
126
+ (?: # Match delimiter
127
+ (#{delimiter}) # field delimiter
128
+ | # ...or...
129
+ ($) # end of line
130
+ )
131
+ | # ...or...
132
+ ( # Match text
133
+ (?: #{lquote}.*?#{rquote}) # quoted string
134
+ | # ...or...
135
+ (?: [^#{delimiter}]*) # text without delimiters
136
+ )
137
+ EOTEXT
138
+ @field_regex = Regexp.compile(s, Regexp::EXTENDED)
139
+ end
140
+
141
+ # Enumerate the lines in the source yielding arrays of delimiter-separated
142
+ # fields.
143
+ #
144
+ # A quoted field may contain delimiter characters that do not split the field.
145
+ def each
146
+ @source.each do |line|
147
+ line.chomp!
148
+ record = []
149
+ # Scan comma-delimited fields. Allow commas to appear inside
150
+ # double-quoted strings.
151
+ field = ""
152
+ line.scan(@field_regex) do |match|
153
+ comma_delimiter = (not match[0].nil?)
154
+ eol_delimiter = (not match[1].nil?)
155
+ text = match[2]
156
+ if not (comma_delimiter or eol_delimiter)
157
+ # Append text in the middle of a field.
158
+ field += text if not text.nil?
159
+ else
160
+ # Add field to the record at a delimiter.
161
+ record << field
162
+ field = ""
163
+ end
164
+ end # line.scan
165
+ yield record
166
+ end # @source.each
167
+ end
168
+
169
+ end # DelimitedReader
170
+
171
+
172
+ # Parses field-delimited text with a header row yielding record hashes.
173
+ #
174
+ # The first row of the file contains field names. This class yields a hash
175
+ # with the field values assigned to their corresponding header names.
176
+ #
177
+ # > StructuredText::LabeledDelimitedReader.new(<<-EOTEXT
178
+ # " Fruit,Color,Shape
179
+ # " apples,red,round
180
+ # " bananas,yellow,oblong
181
+ # " EOTEXT
182
+ # > ).collect
183
+ # => [{"Shape"=>"round", "Fruit"=>"apples", "Color"=>"red"}, {"Shape"=>"oblong", "Fruit"=>"bananas", "Color"=>"yellow"}]
184
+ #
185
+ # If there are fewer fields in a line than there are headers, the remaining
186
+ # ones will be padded with nil. If there are more fields, a RuntimeError
187
+ # will be raised.
188
+ class LabeledDelimitedReader < DelimitedReader
189
+
190
+ def each # :yields: Hash of column labels and field values
191
+ header_row = nil
192
+ super do |record|
193
+ if header_row.nil?
194
+ header_row = record
195
+ else
196
+ if record.length > header_row.length
197
+ raise "More fields than headers:\n#{record.inspect}"
198
+ end
199
+ yield Hash[*header_row.zip(record).flatten]
200
+ end
201
+ end
202
+ end
203
+
204
+ end # LabeledDelimitedReader
205
+
206
+ end # StructuredText
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env ruby -w
2
+
3
+ #--
4
+
5
+ # Copyright 2009 William Patrick McNeill
6
+ #
7
+ # This file is part of StructuredText.
8
+ #
9
+ # StructuredText is free software; you can redistribute it and/or modify it
10
+ # under the terms of the GNU General Public License as published by the Free
11
+ # Software Foundation; either version 2 of the License, or (at your option)
12
+ # any later version.
13
+ #
14
+ # StructuredText is distributed in the hope that it will be useful, but
15
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
17
+ # more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License along with
20
+ # StructuredText; if not, write to the Free Software Foundation, Inc., 51 Franklin
21
+ # St, Fifth Floor, Boston, MA 02110-1301 USA
22
+ #
23
+ #++
24
+
25
+ # Test cases for the StructuredText module
26
+
27
+ require "test/unit"
28
+ require "structuredtext"
29
+
30
+
31
+ class CommentedReaderTestCase < Test::Unit::TestCase
32
+ def test_basic
33
+ # Canonical
34
+ assert_equal([" one "], StructuredText::CommentedReader.new(" one # comment").collect)
35
+ # Multiline
36
+ assert_equal([" one ", "two"], StructuredText::CommentedReader.new(" one # comment\ntwo").collect)
37
+ end
38
+
39
+ def test_custom_comment_delimiter
40
+ # Canonical
41
+ assert_equal([" one "], StructuredText::CommentedReader.new(" one ; comment", ";").collect)
42
+ # Multiline
43
+ assert_equal([" one ", "two"], StructuredText::CommentedReader.new(" one ; comment\ntwo", ";").collect)
44
+ end
45
+
46
+ def test_skip_blanks
47
+ s = " one # comment\n# comment\n\ntwo\n# comment"
48
+ assert_equal([" one ", "two"], StructuredText::CommentedReader.new(s).collect)
49
+ assert_equal([" one ", "", "", "two", ""], StructuredText::CommentedReader.new(s, "#", false).collect)
50
+ end
51
+
52
+ end # CommentedReaderTestCase
53
+
54
+
55
+ class DelimitedReaderTestCase < Test::Unit::TestCase
56
+ def test_basic
57
+ # Canonical
58
+ assert_equal([["a", "b", "c"]], StructuredText::DelimitedReader.new("a,b,c").collect)
59
+ # Multiline: uniform record length
60
+ assert_equal([["a", "b", "c"], ["d", "e", "f"]], StructuredText::DelimitedReader.new("a,b,c\nd,e,f").collect)
61
+ # Multiline: varying record length
62
+ assert_equal([["a", "b", "c"], ["d", "e"]], StructuredText::DelimitedReader.new("a,b,c\nd,e").collect)
63
+ end
64
+
65
+ def test_quoted
66
+ # Beginning
67
+ assert_equal([['"a,b"', "c"]], StructuredText::DelimitedReader.new('"a,b",c').collect)
68
+ # Middle
69
+ assert_equal([["a", '"b,c"', "d"]], StructuredText::DelimitedReader.new('a,"b,c",d').collect)
70
+ # End
71
+ assert_equal([["a",'"b,c"']], StructuredText::DelimitedReader.new('a,"b,c"').collect)
72
+ end
73
+
74
+ def test_empty
75
+ # Beginning
76
+ assert_equal([["", "b", "c"]], StructuredText::DelimitedReader.new(",b,c").collect)
77
+ # Middle
78
+ assert_equal([["a", "", "c"]], StructuredText::DelimitedReader.new("a,,c").collect)
79
+ # End
80
+ assert_equal([["a", "b", ""]], StructuredText::DelimitedReader.new("a,b,").collect)
81
+ end
82
+
83
+ def test_single
84
+ assert_equal([["a"]], StructuredText::DelimitedReader.new("a").collect)
85
+ assert_equal([], StructuredText::DelimitedReader.new("").collect)
86
+ end
87
+
88
+ def test_custom_delimiter
89
+ # Canonical
90
+ assert_equal([["a", "b", "c"], ["d", "e", "f"]], StructuredText::DelimitedReader.new("a;b;c\nd;e;f", ";").collect)
91
+ # Quoted string in the middle
92
+ assert_equal([["a", '"b;c"', "d"]], StructuredText::DelimitedReader.new('a;"b;c";d', ";").collect)
93
+ end
94
+
95
+ def test_custom_quote_left_and_right_same
96
+ # Beginning
97
+ assert_equal([['|a,b|', 'c']], StructuredText::DelimitedReader.new('|a,b|,c', ",", "|").collect)
98
+ # Middle
99
+ assert_equal([['a', '|b,c|', 'd']], StructuredText::DelimitedReader.new('a,|b,c|,d', ",", "|").collect)
100
+ # End
101
+ assert_equal([['a','|b,c|']], StructuredText::DelimitedReader.new('a,|b,c|', ",", "|").collect)
102
+ end
103
+
104
+ def test_custom_quote_left_and_right_different
105
+ # Beginning
106
+ assert_equal([['(a,b)', 'c']], StructuredText::DelimitedReader.new('(a,b),c', ",", "(", ")").collect)
107
+ # Middle
108
+ assert_equal([['a', '(b,c)', 'd']], StructuredText::DelimitedReader.new('a,(b,c),d', ",", "(", ")").collect)
109
+ # End
110
+ assert_equal([['a','(b,c)']], StructuredText::DelimitedReader.new('a,(b,c)', ",", "(", ")").collect)
111
+ end
112
+
113
+ end # DelimitedReaderTestCase
114
+
115
+
116
+ class LabeledReaderTestCase < Test::Unit::TestCase
117
+ def test_basic
118
+ # Canonical
119
+ assert_equal([{"X"=>"a", "Y"=>"b", "Z"=>"c"}], StructuredText::LabeledDelimitedReader.new("X,Y,Z\na,b,c").collect)
120
+ # Multiline
121
+ assert_equal([{"X"=>"a", "Y"=>"b", "Z"=>"c"}, {"X"=>"d", "Y"=>"e", "Z"=>"f"}],
122
+ StructuredText::LabeledDelimitedReader.new("X,Y,Z\na,b,c\nd,e,f").collect)
123
+ end
124
+
125
+ def test_exception
126
+ assert_raise(RuntimeError) { StructuredText::LabeledDelimitedReader.new("X,Y,Z\na,b,c,d").collect }
127
+ end
128
+ end
129
+
130
+
131
+ class ScenarioTestCase < Test::Unit::TestCase
132
+ def test_commented_labeled_text_with_all_custom_characters
133
+ text =<<-EOTEXT
134
+ ; This is the header row
135
+ Fruit|Color|Shape
136
+ apples|(red|green)|round ; The first data row
137
+ bananas|yellow|oblong
138
+ ; The end
139
+ EOTEXT
140
+ r = StructuredText::LabeledDelimitedReader.new(StructuredText::CommentedReader.new(text, ";"), "|", "(", ")")
141
+ assert_equal([{'Shape'=>'round ', 'Fruit'=>'apples', 'Color'=>'(red|green)'},
142
+ {'Shape'=>'oblong', 'Fruit'=>'bananas', 'Color'=>'yellow'}], r.collect)
143
+ end
144
+ end
metadata ADDED
@@ -0,0 +1,60 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: structuredtext
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - W.P. McNeill
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-11 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: This module provides utilities for working with various kinds of structured text. It includes comment handling and comma-separated-value (CSV) parsing with support for quoted strings.
17
+ email: billmcn@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - README
24
+ files:
25
+ - test/test_structuredtext.rb
26
+ - lib/structuredtext.rb
27
+ - README
28
+ has_rdoc: true
29
+ homepage: http://structuredtext.rubyforge.org/
30
+ post_install_message:
31
+ rdoc_options:
32
+ - - --title
33
+ - StructuredText -- Structured Text Utilities
34
+ - --main
35
+ - README
36
+ - --line-numbers
37
+ - --inline-source
38
+ require_paths:
39
+ - lib
40
+ required_ruby_version: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: "0"
45
+ version:
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: "0"
51
+ version:
52
+ requirements: []
53
+
54
+ rubyforge_project: structuredtext
55
+ rubygems_version: 1.1.1
56
+ signing_key:
57
+ specification_version: 2
58
+ summary: Structured text processing utilities
59
+ test_files:
60
+ - test/test_structuredtext.rb