strsyntax 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/strsyntax +42 -0
- data/lib/strsyntax.rb +2 -0
- data/lib/strsyntax/stdlib_ext.rb +53 -0
- data/lib/strsyntax/strsyntax.rb +119 -0
- data/test/test_parser.rb +30 -0
- metadata +50 -0
data/bin/strsyntax
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'strsyntax'
|
5
|
+
|
6
|
+
# Command-line front end: prints the syntax, parts or structure of one word.
#
# The last mode flag given wins; with no flag the full structure is printed.
options = { :command => :structure }

parser = OptionParser.new do |opts|
  # Use the real program name so the usage line matches the installed executable.
  opts.banner = "Usage: #{File.basename( $0 )} [options] word"
  opts.separator ""

  opts.on( "-y", "--syntax", "Splits string into syntax elements: [:cv,:vc,:vc]" ) do
    options[:command] = :syntax
  end

  opts.on( "-p", "--parts", "Splits string into syntax instances: ['ro','ut','er']" ) do
    options[:command] = :parts
  end

  opts.on( "-s", "--structure", "Splits string into syntax structure: [[:cv,'ro'],[:vc,'ut'],[:vc,'er']]" ) do
    options[:command] = :structure
  end

  opts.separator ""
end

# parse! strips the recognised switches from ARGV, leaving only the word(s).
begin
  parser.parse!( ARGV )
rescue OptionParser::ParseError => e
  # Unknown or malformed switch: report it with the usage text, not a stack trace.
  $stderr.puts e.message
  $stderr.puts parser.to_s
  exit 1
end

str = ARGV.first

# Without a word there is nothing to analyse; show usage rather than failing on nil.
if str.nil?
  $stderr.puts parser.to_s
  exit 1
end

case options[:command]
when :syntax
  puts str.syntax.inspect
when :parts
  puts str.parts.inspect
when :structure
  puts str.structure.inspect
end
|
data/lib/strsyntax.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
#
|
2
|
+
# Extend standard library classes with methods required by Ngrams
|
3
|
+
#
|
4
|
+
# Refuse to overwrite an existing String#syntax. The check must look at
# instance methods: String.respond_to? would ask about the String class object
# itself and so would never detect a clash.
if String.method_defined?( :syntax ) || String.private_method_defined?( :syntax )
  raise "Cannot patch in String#syntax as it is already defined!"
end

class String
  # Returns an array containing the syntax of the string in terms of consonant and vowel
  # groups. The groups are CVC, VCV, CV, VC, C, and V. For example:
  #
  #   "groucho".syntax  => [:c,:cv,:vc,:cv]
  #   "harpo".syntax    => [:cvc,:cv]
  #   "chico".syntax    => [:c,:cv,:cv]
  #   "zeppo".syntax    => [:cvc,:cv]
  #   "teasdale".syntax => [:cv,:vc,:cv,:cv]
  #
  # The work is delegated to StringSyntax::Parser.parse.
  def syntax
    StringSyntax::Parser.parse( self )
  end
end
|
21
|
+
|
22
|
+
# Refuse to overwrite an existing String#parts. The check must look at
# instance methods: String.respond_to? would ask about the String class object
# itself and so would never detect a clash.
if String.method_defined?( :parts ) || String.private_method_defined?( :parts )
  raise "Cannot patch in String#parts as it is already defined!"
end

class String
  # Return an array containing the constituent parts of the string as represented by
  # its syntax. For example:
  #
  #   "groucho".parts  => ["g","ro","uc","ho"]
  #   "harpo".parts    => ["har","po"]
  #   "chico".parts    => ["c","hi","co"]
  #   "zeppo".parts    => ["zep","po"]
  #   "teasdale".parts => ["te","as","da","le"]
  #
  # The work is delegated to StringSyntax::Parser.split.
  def parts
    StringSyntax::Parser.split( self )
  end
end
|
39
|
+
|
40
|
+
# Refuse to overwrite an existing String#structure. The check must look at
# instance methods: String.respond_to? would ask about the String class object
# itself and so would never detect a clash.
if String.method_defined?( :structure ) || String.private_method_defined?( :structure )
  raise "Cannot patch in String#structure as it is already defined!"
end

class String
  # Returns an array containing an amalgam of the syntax and parts of the string:
  # one [schema, text] pair per group. See #syntax and #parts for more
  # information. Example:
  #
  #   "groucho".structure => [[:c,"g"],[:cv,"ro"],[:vc,"uc"],[:cv,"ho"]]
  #
  # Returns an empty array when #syntax finds no groups (for example for an
  # empty string) instead of failing on a nil result.
  def structure
    schemas = syntax
    return [] if schemas.nil? || schemas.empty?

    schemas.zip( parts )
  end
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Can we decode any word into some combination of
|
4
|
+
# CV, VC, CVC, VCV
|
5
|
+
# ?
|
6
|
+
#
|
7
|
+
|
8
|
+
module StringSyntax
  # Breaks a word into consonant/vowel groups. Each group is named by a schema
  # symbol (:cvc, :vcv, :cv, :vc, :c or :v) and recognised by the matching
  # template regexp below. Only the ASCII letters a-z are recognised; "y" is
  # treated as a consonant.
  class Parser
    # Templates are anchored with \A (start of string), not ^ (start of any
    # line), so a newline inside the input cannot produce a match that starts
    # somewhere other than position 0.
    C   = /\A([bcdfghjklmnpqrstvwxyz])/
    V   = /\A([aeiou])/
    CV  = /\A([bcdfghjklmnpqrstvwxyz][aeiou])/
    CVC = /\A([bcdfghjklmnpqrstvwxyz][aeiou][bcdfghjklmnpqrstvwxyz])/
    VC  = /\A([aeiou][bcdfghjklmnpqrstvwxyz])/
    VCV = /\A([aeiou][bcdfghjklmnpqrstvwxyz][aeiou])/

    # Template regexp => schema symbol.
    SCHEMA_TEMPLATES = {
      C   => :c,
      V   => :v,
      CV  => :cv,
      CVC => :cvc,
      VC  => :vc,
      VCV => :vcv
    }.freeze

    # Schema symbol => template regexp (the reverse lookup).
    TEMPLATES_BY_SCHEMA = SCHEMA_TEMPLATES.invert.freeze

    # Returns an array containing the components of the string in terms of consonant
    # and vowel groupings: CVC, VCV, CV, VC, C, V.
    #
    # The string is downcased first. At every position up to two groupings are
    # tried (see candidates_for) and the one whose overall result contains the
    # fewest stray :c / :v groups wins; on a tie the first (longer) grouping is
    # kept. Parsing stops at the first character that is not a recognised
    # letter, so anything after it is ignored. Returns [] when nothing matches.
    #
    # Results are built from the end of the string backwards so every suffix is
    # solved once, rather than being re-parsed for each way of reaching it.
    def self.parse( s )
      s = s.downcase
      size = s.length

      # best[i] holds the chosen structure for the suffix starting at index i.
      best = Array.new( size + 1 )
      best[size] = []

      ( size - 1 ).downto( 0 ) do |i|
        options = candidates_for( s[i..-1] ).map do |schema, length|
          [ schema, *best[i + length] ]
        end
        # min_by keeps the first of equally scored options; no options => [].
        best[i] = options.min_by { |option| score( option ) } || []
      end

      best[0]
    end

    # Return the number of :c and :v components in the structure
    def self.score( s )
      s.count { |e| e == :c || e == :v }
    end

    # Cuts +s+ into the pieces described by +structure+ (by default the result
    # of parse), e.g. split( "hello" ) => ["hel","lo"].
    #
    # Matching is done against a downcased copy, as in parse, while the pieces
    # are sliced from the original string so its letter case is preserved.
    # Raises ArgumentError when a schema in +structure+ does not match the text
    # at its position, and RuntimeError for an unknown schema symbol.
    def self.split( s, structure = parse( s ) )
      remaining = s
      lowered = s.downcase

      ( structure || [] ).map do |schema|
        match_data = template_from_schema( schema ).match( lowered )
        if match_data.nil?
          raise ArgumentError, "Structure element #{schema.inspect} does not match #{remaining.inspect}"
        end

        length = match_data[1].length
        piece = remaining[0, length]
        remaining = remaining[length..-1]
        lowered = match_data.post_match
        piece
      end
    end

    # Returns the schema symbol for a template regexp, or nil if it is not one
    # of the templates in SCHEMA_TEMPLATES.
    def self.schema_from_template( template )
      SCHEMA_TEMPLATES[template]
    end

    # Returns the template regexp for a schema symbol; raises for anything else.
    def self.template_from_schema( schema )
      TEMPLATES_BY_SCHEMA[schema] or raise "Unknown schema: #{schema}"
    end

    # Returns the schema symbol of the first group at the start of +s+, trying
    # the templates in the order CVC, CV, VCV, VC, C, V; nil when none matches.
    def self.schema_from_string( s )
      case s
      when CVC then :cvc
      when CV  then :cv
      when VCV then :vcv
      when VC  then :vc
      when C   then :c
      when V   then :v
      end
    end

    # Returns the schema for +template+ followed by the parse of +rest+.
    # +match+ is accepted but not used. No longer called by parse; kept
    # callable so existing users of the method keep working.
    def self.parse_subpart( template, match, rest )
      [ schema_from_template( template ), *parse( rest ) ].compact
    end

    # Lists the [schema, length] groupings to try at the start of +s+, in
    # priority order: the longest matching group and then the one a character
    # shorter. Returns [] when +s+ does not start with a recognised letter.
    def self.candidates_for( s )
      case s
      when CVC then [ [:cvc, 3], [:cv, 2] ]
      when CV  then [ [:cv, 2], [:c, 1] ]
      when C   then [ [:c, 1] ]
      when VCV then [ [:vcv, 3], [:vc, 2] ]
      when VC  then [ [:vc, 2], [:v, 1] ]
      when V   then [ [:v, 1] ]
      else []
      end
    end
    # A bare `private` does not apply to `def self.` methods, so say it explicitly.
    private_class_method :candidates_for
  end
end
|
112
|
+
|
113
|
+
# Demo: when this file is run directly, print the parsed syntax of a few sample words.
if $0 == __FILE__
  WORDS = %w[hello google amazon ookles paoga linguistics antidisestablishmentarianism]

  WORDS.each do |sample|
    parsed = StringSyntax::Parser.parse( sample )
    puts "#{sample} -> #{parsed.inspect}"
  end
end
|
data/test/test_parser.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
$:<< File.join( File.dirname( __FILE__ ), '..', 'lib' )
|
2
|
+
|
3
|
+
require 'strsyntax'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
# Exercises the String#syntax, String#parts and String#structure extensions.
class TestParser < Test::Unit::TestCase

  # Each word must be classified into the expected sequence of schema symbols.
  def test_syntax
    [
      [ "hello",       [:cvc,:cv] ],
      [ "amplitude",   [:vc,:c,:cvc,:vcv] ],
      [ "transmitter", [:c,:cvc,:c,:cvc,:cvc] ],
      [ "google",      [:cv,:vc,:cv] ],
      [ "aluminium",   [:vcv,:cv,:cv,:vc] ],
      [ "teasdale",    [:cv,:vc,:cv,:cv] ]
    ].each do |word, expected|
      assert_equal expected, word.syntax
    end
  end

  # Each word must be cut into the text pieces that correspond to its syntax.
  def test_parts
    [
      [ "hello",       ["hel","lo"] ],
      [ "amplitude",   ["am","p","lit","ude"] ],
      [ "transmitter", ["t","ran","s","mit","ter"] ],
      [ "google",      ["go","og","le"] ],
      [ "aluminium",   ["alu","mi", "ni","um"] ],
      [ "teasdale",    ["te","as","da","le"] ]
    ].each do |word, expected|
      assert_equal expected, word.parts
    end
  end

  # The structure pairs every schema symbol with its piece of text.
  def test_structure
    expected = [[:c,"g"],[:cv,"ro"],[:vc,"uc"],[:cv,"ho"]]
    assert_equal expected, "groucho".structure
  end

end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: !int:Fixnum 1
|
4
|
+
name: strsyntax
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.2
|
7
|
+
date: 2006-07-28 00:00:00 +01:00
|
8
|
+
summary: A library to return the syntax/structure of a word in term of consonant and vowel groups.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: self@mattmower.com
|
12
|
+
homepage: http://rubyforge.org/projects/rubymatt/
|
13
|
+
rubyforge_project: rubymatt
|
14
|
+
description:
|
15
|
+
autorequire: strsyntax
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Matt Mower
|
31
|
+
files:
|
32
|
+
- lib/strsyntax.rb
|
33
|
+
- lib/strsyntax/stdlib_ext.rb
|
34
|
+
- lib/strsyntax/strsyntax.rb
|
35
|
+
- test/test_parser.rb
|
36
|
+
- bin/strsyntax
|
37
|
+
test_files: []
|
38
|
+
|
39
|
+
rdoc_options: []
|
40
|
+
|
41
|
+
extra_rdoc_files: []
|
42
|
+
|
43
|
+
executables:
|
44
|
+
- strsyntax
|
45
|
+
extensions: []
|
46
|
+
|
47
|
+
requirements: []
|
48
|
+
|
49
|
+
dependencies: []
|
50
|
+
|