structuredtext 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,90 @@
1
+ = Structured Text Utilities
2
+
3
+ This module provides utilities for working with structured text.
4
+
5
+ This includes comment handling and delimited field (aka CSV) parsing with
6
+ support for quoted strings.
7
+
8
+ === Commented Text
9
+
10
+ The StructuredText::CommentedReader class removes comments from text.
11
+
12
+ Comments start with a specified comment delimiter (the default is "#") and
13
+ continue to the end of the line. For example, the following text:
14
+
15
+ line 1 # comment 1
16
+ # comment 2
17
+ line 3 # comment 3
18
+
19
+ becomes:
20
+
21
+ line 1
22
+ line 3
23
+
24
+ The default comment delimiter is "#". A different delimiter may be specified
25
+ when the object is created. Blank lines may either be returned or ignored.
26
+
27
+ === Delimited Text
28
+
29
+ The StructuredText::DelimitedReader class parses field-delimited text
30
+ yielding records.
31
+
32
+ In field-delimited text, each line is a record that consists of a series of
33
+ fields delimited by a specified character. When that character is a comma
34
+ these are called comma-separated-value (CSV) files.
35
+
36
+ An array of fields is yielded for each line of field-delimited text. For
37
+ example, the following text:
38
+
39
+ apples, red, round
40
+ bananas, yellow, oblong
41
+
42
+ is parsed into these arrays:
43
+
44
+ ['apples', ' red', ' round']
45
+ ['bananas', ' yellow', ' oblong']
46
+
47
+ The field text may contain quoted strings. Delimiter characters inside
48
+ quotes are not treated as field delimiters. So:
49
+
50
+ apples,"red,green",round
51
+ bananas,yellow,oblong
52
+
53
+ becomes:
54
+
55
+ ['apples', '"red,green"', 'round']
56
+ ['bananas', 'yellow', 'oblong']
57
+
58
+ Note here that the second field of the first line contains the text
59
+ "red,green".
60
+
61
+ The caller may specify custom field delimiter and right- and left-hand quote
62
+ characters.
63
+
64
+ The StructuredText::LabeledDelimitedReader class extends this functionality
65
+ by treating the first line of the text as a header row that contains field
66
+ names. A hash with the field values assigned to their corresponding header
67
+ names is yielded for each line of input. For example, the following text:
68
+
69
+ Fruit,Color,Shape
70
+ apples,red,round
71
+ bananas,yellow,oblong
72
+
73
+ is parsed into these hashes:
74
+
75
+ {"Shape"=>"round", "Fruit"=>"apples", "Color"=>"red"}
76
+ {"Shape"=>"oblong", "Fruit"=>"bananas", "Color"=>"yellow"}
77
+
78
+ = History
79
+
80
+ 1.0.0:: Comment handling and field-delimited text
81
+
82
+ = Copyright
83
+
84
+ Copyright 2009, William Patrick McNeill
85
+
86
+ This program is distributed under the GNU General Public License.
87
+
88
+ = Author
89
+
90
+ W.P. McNeill mailto:billmcn@gmail.com
@@ -0,0 +1,206 @@
1
+ # Copyright 2009 William Patrick McNeill
2
+ #
3
+ # This file is part of StructuredText.
4
+ #
5
+ # StructuredText is free software; you can redistribute it and/or modify it
6
+ # under the terms of the GNU General Public License as published by the Free
7
+ # Software Foundation; either version 2 of the License, or (at your option)
8
+ # any later version.
9
+ #
10
+ # StructuredText is distributed in the hope that it will be useful, but
11
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13
+ # more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License along with
16
+ # StructuredText; if not, write to the Free Software Foundation, Inc., 51 Franklin
17
+ # St, Fifth Floor, Boston, MA 02110-1301 USA
18
+ #
19
+
20
+
21
# Utilities for working with various kinds of structured text.
module StructuredText
  VERSION = "1.0.0"

  # Shared line iteration for the readers in this module.
  module SourceLines # :nodoc:
    private

    # Yield each line of _source_.
    #
    # Sources that respond to +each_line+ (String, IO, StringIO) are read
    # with it; anything else (an Array of lines, another reader) is read
    # with +each+. String has no +each+ method from Ruby 1.9 onwards, so
    # calling +each+ unconditionally would fail on string sources.
    def each_source_line(source, &block)
      if source.respond_to?(:each_line)
        source.each_line(&block)
      else
        source.each(&block)
      end
    end
  end


  # Removes comments from text.
  #
  # Comments start with a specified comment delimiter and continue to the end
  # of the line. Text before the delimiter, including trailing whitespace, is
  # kept as is.
  #
  #   > text = "line 1 # comment 1\n# comment 2\nline 3 # comment 3\n"
  #   > StructuredText::CommentedReader.new(text).collect
  #   => ["line 1 ", "line 3 "]
  #   > StructuredText::CommentedReader.new(text, "#", false).collect
  #   => ["line 1 ", "", "line 3 "]
  #
  # The default comment delimiter is "#". A different delimiter may be
  # specified when the object is created. Blank lines may either be returned
  # or ignored.
  class CommentedReader
    include Enumerable
    include SourceLines

    # Initialize the reader with the text to parse and parameters that
    # determine how comments will be processed.
    #
    # [_source_] an enumerable set of text lines, e.g. a stream or a string.
    # [<em>comment_delimiter</em>] the comment delimiter, the default is
    #                              <em>"#"</em>. It is matched literally.
    # [<em>skip_blanks</em>] skip lines that are empty once the comment has
    #                        been removed, the default is _true_
    def initialize(source, comment_delimiter = "#", skip_blanks = true)
      @source = source
      # Escape the delimiter so that regular expression control characters
      # such as "*" or "(" are treated as plain text.
      @comment_regex = Regexp.compile(Regexp.escape(comment_delimiter) + '.*$')
      @skip_blanks = skip_blanks
    end

    # Enumerate the lines in the source, removing all text after a comment
    # delimiter. The source lines themselves are not modified.
    #
    # Returns an Enumerator when called without a block.
    def each # :yields: line with comments removed
      return enum_for(:each) unless block_given?
      each_source_line(@source) do |line|
        # Work on copies: the source may hold frozen strings or lines the
        # caller still needs intact.
        uncommented = line.chomp.sub(@comment_regex, "")
        yield uncommented unless @skip_blanks && uncommented.empty?
      end
      self
    end
  end # CommentedReader


  # Parses field-delimited text yielding records.
  #
  # In field-delimited text, each line is a record that consists of a series
  # of fields delimited by a specified character. When that character is a
  # comma these are called comma-separated-value (CSV) files.
  #
  # This class enumerates field-delimited text yielding an array of fields for
  # each line of input.
  #
  #   > StructuredText::DelimitedReader.new(<<-EOTEXT
  #   " apples, red, round
  #   " bananas, yellow, oblong
  #   " EOTEXT
  #   > ).collect
  #   => [["apples", " red", " round"], ["bananas", " yellow", " oblong"]]
  #
  # The field text may contain quoted strings. Delimiter characters inside
  # quotes are not treated as field delimiters.
  #
  #   > StructuredText::DelimitedReader.new(<<-EOTEXT
  #   " apples,"red,green",round
  #   " bananas,yellow,oblong
  #   " EOTEXT
  #   > ).collect
  #   => [["apples", "\"red,green\"", "round"], ["bananas", "yellow", "oblong"]]
  #
  # Note here that the second field of the first line contains the text
  # "red,green", with the quote characters retained.
  #
  # The caller may specify custom field delimiter and right- and left-hand
  # quote characters.
  class DelimitedReader
    include Enumerable
    include SourceLines

    # Initialize the reader with the text and optional characters that control
    # the parsing format.
    #
    # By default, the field delimiter is a comma (,) and the quote character
    # is a double-quote ("). Both of these defaults can be overridden with
    # arguments passed to this function. The caller may also specify different
    # left-hand and right-hand quote characters, e.g. ( and ).
    #
    # [_source_] an enumerable set of text lines, e.g. a stream or a string.
    # [_delimiter_] the field delimiter character
    # [_lquote_] the left-hand field quote character
    # [_rquote_] the right-hand field quote character; if unspecified, it is
    #            identical to the left-hand field quote
    def initialize(source, delimiter = ",", lquote = '"', rquote = nil)
      @source = source
      # Escape the custom characters in case the caller provides a regular
      # expression control character.
      delimiter = Regexp.escape(delimiter)
      lquote = Regexp.escape(lquote)
      rquote = rquote.nil? ? lquote : Regexp.escape(rquote)
      # Capture groups: 1 = field delimiter, 2 = end of line, 3 = field text.
      s = <<-EOTEXT
      (?:                               # Match delimiter
          (#{delimiter})                #   field delimiter
        |                               #   ...or...
          ($)                           #   end of line
      )
      |                                 # ...or...
      (                                 # Match text
          (?: #{lquote}.*?#{rquote})    #   quoted string
        |                               #   ...or...
          (?: [^#{delimiter}]*)         #   text without delimiters
      )
      EOTEXT
      @field_regex = Regexp.compile(s, Regexp::EXTENDED)
    end

    # Enumerate the lines in the source yielding arrays of delimited fields.
    #
    # A quoted field may contain non-field-delimiting delimiter characters.
    # The source lines themselves are not modified.
    #
    # Returns an Enumerator when called without a block.
    def each # :yields: Array of field strings
      return enum_for(:each) unless block_given?
      each_source_line(@source) do |line|
        record = []
        field = ""
        # Each scan step matches either a delimiter, the end of the line, or
        # a chunk of field text (a quoted string or a delimiter-free run).
        line.chomp.scan(@field_regex) do |field_delimiter, end_of_line, text|
          if field_delimiter || end_of_line
            # A delimiter closes the field collected so far.
            record << field
            field = ""
          elsif text
            # Text in the middle of a field: a field may be built from
            # several chunks, e.g. a quoted string followed by plain text.
            field += text
          end
        end
        yield record
      end
      self
    end

  end # DelimitedReader


  # Parses field-delimited text with a header row yielding record hashes.
  #
  # The first row of the file contains field names. This class yields a hash
  # with the field values assigned to their corresponding header names.
  #
  #   > StructuredText::LabeledDelimitedReader.new(<<-EOTEXT
  #   " Fruit,Color,Shape
  #   " apples,red,round
  #   " bananas,yellow,oblong
  #   " EOTEXT
  #   > ).collect
  #   => [{"Fruit"=>"apples", "Color"=>"red", "Shape"=>"round"}, {"Fruit"=>"bananas", "Color"=>"yellow", "Shape"=>"oblong"}]
  #
  # If there are fewer fields in a line than there are headers, the remaining
  # ones will be padded with nil. If there are more fields, a RuntimeError
  # will be raised.
  class LabeledDelimitedReader < DelimitedReader

    # Enumerate the data rows of the source, yielding one hash per row keyed
    # by the labels found in the first row. The header row is not yielded.
    #
    # Raises RuntimeError if a row has more fields than there are headers.
    # Returns an Enumerator when called without a block.
    def each # :yields: Hash of column labels and field values
      return enum_for(:each) unless block_given?
      header_row = nil
      super do |record|
        if header_row.nil?
          # The first record supplies the labels.
          header_row = record
          next
        end
        if record.length > header_row.length
          raise "More fields than headers:\n#{record.inspect}"
        end
        # zip pads short records with nil.
        yield Hash[header_row.zip(record)]
      end
    end

  end # LabeledDelimitedReader

end # StructuredText
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env ruby -w
2
+
3
+ #--
4
+
5
+ # Copyright 2009 William Patrick McNeill
6
+ #
7
+ # This file is part of StructuredText.
8
+ #
9
+ # StructuredText is free software; you can redistribute it and/or modify it
10
+ # under the terms of the GNU General Public License as published by the Free
11
+ # Software Foundation; either version 2 of the License, or (at your option)
12
+ # any later version.
13
+ #
14
+ # StructuredText is distributed in the hope that it will be useful, but
15
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
17
+ # more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License along with
20
+ # StructuredText; if not, write to the Free Software Foundation, Inc., 51 Franklin
21
+ # St, Fifth Floor, Boston, MA 02110-1301 USA
22
+ #
23
+ #++
24
+
25
+ # Test cases for the StructuredText module
26
+
27
+ require "test/unit"
28
+ require "structuredtext"
29
+
30
+
31
# Tests for StructuredText::CommentedReader.
class CommentedReaderTestCase < Test::Unit::TestCase
  # The default "#" delimiter is stripped from single- and multi-line text.
  def test_basic
    # Canonical
    assert_equal([" one "], uncomment(" one # comment"))
    # Multiline
    assert_equal([" one ", "two"], uncomment(" one # comment\ntwo"))
  end

  # A caller-supplied delimiter replaces the default.
  def test_custom_comment_delimiter
    # Canonical
    assert_equal([" one "], uncomment(" one ; comment", ";"))
    # Multiline
    assert_equal([" one ", "two"], uncomment(" one ; comment\ntwo", ";"))
  end

  # Empty lines are dropped by default and kept when skip_blanks is false.
  def test_skip_blanks
    text = " one # comment\n# comment\n\ntwo\n# comment"
    assert_equal([" one ", "two"], uncomment(text))
    assert_equal([" one ", "", "", "two", ""], uncomment(text, "#", false))
  end

  private

  # Run a CommentedReader built from the given arguments and collect its
  # output.
  def uncomment(*reader_args)
    StructuredText::CommentedReader.new(*reader_args).collect
  end

end # CommentedReaderTestCase
53
+
54
+
55
# Tests for StructuredText::DelimitedReader.
class DelimitedReaderTestCase < Test::Unit::TestCase
  # Plain comma-separated records.
  def test_basic
    # Canonical
    assert_equal([["a", "b", "c"]], fields("a,b,c"))
    # Multiline: uniform record length
    assert_equal([["a", "b", "c"], ["d", "e", "f"]], fields("a,b,c\nd,e,f"))
    # Multiline: varying record length
    assert_equal([["a", "b", "c"], ["d", "e"]], fields("a,b,c\nd,e"))
  end

  # Delimiters inside double quotes do not split the field.
  def test_quoted
    # Beginning
    assert_equal([['"a,b"', "c"]], fields('"a,b",c'))
    # Middle
    assert_equal([["a", '"b,c"', "d"]], fields('a,"b,c",d'))
    # End
    assert_equal([["a",'"b,c"']], fields('a,"b,c"'))
  end

  # Adjacent delimiters produce empty-string fields.
  def test_empty
    # Beginning
    assert_equal([["", "b", "c"]], fields(",b,c"))
    # Middle
    assert_equal([["a", "", "c"]], fields("a,,c"))
    # End
    assert_equal([["a", "b", ""]], fields("a,b,"))
  end

  # One-field and zero-line input.
  def test_single
    assert_equal([["a"]], fields("a"))
    assert_equal([], fields(""))
  end

  # A caller-supplied field delimiter.
  def test_custom_delimiter
    # Canonical
    assert_equal([["a", "b", "c"], ["d", "e", "f"]], fields("a;b;c\nd;e;f", ";"))
    # Quoted string in the middle
    assert_equal([["a", '"b;c"', "d"]], fields('a;"b;c";d', ";"))
  end

  # A single custom character used as both opening and closing quote.
  def test_custom_quote_left_and_right_same
    # Beginning
    assert_equal([['|a,b|', 'c']], fields('|a,b|,c', ",", "|"))
    # Middle
    assert_equal([['a', '|b,c|', 'd']], fields('a,|b,c|,d', ",", "|"))
    # End
    assert_equal([['a','|b,c|']], fields('a,|b,c|', ",", "|"))
  end

  # Distinct opening and closing quote characters.
  def test_custom_quote_left_and_right_different
    # Beginning
    assert_equal([['(a,b)', 'c']], fields('(a,b),c', ",", "(", ")"))
    # Middle
    assert_equal([['a', '(b,c)', 'd']], fields('a,(b,c),d', ",", "(", ")"))
    # End
    assert_equal([['a','(b,c)']], fields('a,(b,c)', ",", "(", ")"))
  end

  private

  # Run a DelimitedReader built from the given arguments and collect its
  # records.
  def fields(*reader_args)
    StructuredText::DelimitedReader.new(*reader_args).collect
  end

end # DelimitedReaderTestCase
114
+
115
+
116
# Tests for StructuredText::LabeledDelimitedReader.
class LabeledReaderTestCase < Test::Unit::TestCase
  # Data rows are returned as hashes keyed by the header row.
  def test_basic
    first_row = {"X"=>"a", "Y"=>"b", "Z"=>"c"}
    second_row = {"X"=>"d", "Y"=>"e", "Z"=>"f"}
    # Canonical
    assert_equal([first_row], labeled("X,Y,Z\na,b,c").collect)
    # Multiline
    assert_equal([first_row, second_row], labeled("X,Y,Z\na,b,c\nd,e,f").collect)
  end

  # A row with more fields than headers raises RuntimeError.
  def test_exception
    reader = labeled("X,Y,Z\na,b,c,d")
    assert_raise(RuntimeError) { reader.collect }
  end

  private

  # Build a LabeledDelimitedReader over the given text.
  def labeled(text)
    StructuredText::LabeledDelimitedReader.new(text)
  end
end
129
+
130
+
131
# End-to-end test chaining a CommentedReader into a LabeledDelimitedReader.
class ScenarioTestCase < Test::Unit::TestCase
  # Uses ";" comments, "|" field delimiters and "(" ")" quotes all at once.
  def test_commented_labeled_text_with_all_custom_characters
    # The heredoc body stays at column 0: <<- keeps leading whitespace.
    text =<<-EOTEXT
; This is the header row
Fruit|Color|Shape
apples|(red|green)|round ; The first data row
bananas|yellow|oblong
; The end
    EOTEXT
    uncommented = StructuredText::CommentedReader.new(text, ";")
    reader = StructuredText::LabeledDelimitedReader.new(uncommented, "|", "(", ")")
    expected = [
      {'Shape'=>'round ', 'Fruit'=>'apples', 'Color'=>'(red|green)'},
      {'Shape'=>'oblong', 'Fruit'=>'bananas', 'Color'=>'yellow'}
    ]
    assert_equal(expected, reader.collect)
  end
end
metadata ADDED
@@ -0,0 +1,60 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: structuredtext
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - W.P. McNeill
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-11 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: This module provides utilities for working with various kinds of structured text. It includes comment handling and comma-separated-value (CSV) parsing with support for quoted strings.
17
+ email: billmcn@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - README
24
+ files:
25
+ - test/test_structuredtext.rb
26
+ - lib/structuredtext.rb
27
+ - README
28
+ has_rdoc: true
29
+ homepage: http://structuredtext.rubyforge.org/
30
+ post_install_message:
31
+ rdoc_options:
32
+ - - --title
33
+ - StructuredText -- Structured Text Utilities
34
+ - --main
35
+ - README
36
+ - --line-numbers
37
+ - --inline-source
38
+ require_paths:
39
+ - lib
40
+ required_ruby_version: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: "0"
45
+ version:
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: "0"
51
+ version:
52
+ requirements: []
53
+
54
+ rubyforge_project: structuredtext
55
+ rubygems_version: 1.1.1
56
+ signing_key:
57
+ specification_version: 2
58
+ summary: Structured text processing utilities
59
+ test_files:
60
+ - test/test_structuredtext.rb