twitter_cldr 3.0.1 → 3.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +4 -2
- data/History.txt +4 -0
- data/README.md +17 -6
- data/lib/twitter_cldr/resources/postal_codes_importer.rb +12 -1
- data/lib/twitter_cldr/resources/regexp_ast_generator.rb +41 -0
- data/lib/twitter_cldr/resources.rb +1 -0
- data/lib/twitter_cldr/shared/postal_code_generator.rb +50 -0
- data/lib/twitter_cldr/shared/postal_codes.rb +48 -9
- data/lib/twitter_cldr/shared.rb +15 -14
- data/lib/twitter_cldr/utils/regexp_ast.rb +115 -0
- data/lib/twitter_cldr/utils/regexp_sampler.rb +149 -0
- data/lib/twitter_cldr/utils.rb +5 -3
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/shared/postal_codes.yml +1442 -159
- data/spec/bidi/bidi_spec.rb +1 -1
- data/spec/collation/collation_spec.rb +1 -1
- data/spec/collation/collator_spec.rb +31 -31
- data/spec/collation/implicit_collation_elements_spec.rb +6 -6
- data/spec/collation/sort_key_builder_spec.rb +28 -26
- data/spec/collation/tailoring_spec.rb +1 -1
- data/spec/collation/trie_builder_spec.rb +16 -16
- data/spec/collation/trie_dumps_spec.rb +2 -2
- data/spec/collation/trie_loader_spec.rb +8 -8
- data/spec/collation/trie_spec.rb +61 -61
- data/spec/collation/trie_with_fallback_spec.rb +5 -5
- data/spec/core_ext_spec.rb +1 -1
- data/spec/data_readers/additional_date_format_selector_spec.rb +38 -38
- data/spec/data_readers/date_time_data_reader_spec.rb +2 -2
- data/spec/data_readers/number_data_reader_spec.rb +1 -1
- data/spec/formatters/calendars/datetime_formatter_spec.rb +218 -218
- data/spec/formatters/list_formatter_spec.rb +8 -8
- data/spec/formatters/numbers/abbreviated/abbreviated_number_formatter_spec.rb +14 -14
- data/spec/formatters/numbers/abbreviated/long_decimal_formatter_spec.rb +4 -4
- data/spec/formatters/numbers/abbreviated/short_decimal_formatter_spec.rb +4 -4
- data/spec/formatters/numbers/currency_formatter_spec.rb +11 -11
- data/spec/formatters/numbers/decimal_formatter_spec.rb +3 -3
- data/spec/formatters/numbers/helpers/fraction_spec.rb +3 -3
- data/spec/formatters/numbers/helpers/integer_spec.rb +16 -16
- data/spec/formatters/numbers/number_formatter_spec.rb +21 -21
- data/spec/formatters/numbers/percent_formatter_spec.rb +3 -3
- data/spec/formatters/numbers/rbnf/rbnf_spec.rb +2 -2
- data/spec/formatters/plurals/plural_formatter_spec.rb +41 -41
- data/spec/formatters/plurals/rules_spec.rb +13 -13
- data/spec/localized/localized_array_spec.rb +12 -12
- data/spec/localized/localized_date_spec.rb +33 -33
- data/spec/localized/localized_datetime_spec.rb +11 -11
- data/spec/localized/localized_hash_spec.rb +4 -4
- data/spec/localized/localized_number_spec.rb +36 -36
- data/spec/localized/localized_object_spec.rb +8 -8
- data/spec/localized/localized_string_spec.rb +53 -53
- data/spec/localized/localized_symbol_spec.rb +9 -9
- data/spec/localized/localized_time_spec.rb +10 -10
- data/spec/localized/localized_timespan_spec.rb +8 -8
- data/spec/normalization_spec.rb +6 -6
- data/spec/parsers/number_parser_spec.rb +36 -36
- data/spec/parsers/parser_spec.rb +5 -5
- data/spec/parsers/segmentation_parser_spec.rb +19 -19
- data/spec/parsers/symbol_table_spec.rb +4 -4
- data/spec/parsers/unicode_regex/character_class_spec.rb +19 -19
- data/spec/parsers/unicode_regex/character_range_spec.rb +1 -1
- data/spec/parsers/unicode_regex/character_set_spec.rb +8 -8
- data/spec/parsers/unicode_regex/literal_spec.rb +5 -5
- data/spec/parsers/unicode_regex/unicode_string_spec.rb +2 -2
- data/spec/parsers/unicode_regex_parser_spec.rb +28 -28
- data/spec/resources/loader_spec.rb +32 -32
- data/spec/shared/break_iterator_spec.rb +13 -13
- data/spec/shared/calendar_spec.rb +59 -59
- data/spec/shared/casefolder_spec.rb +5 -5
- data/spec/shared/code_point_spec.rb +46 -46
- data/spec/shared/currencies_spec.rb +7 -7
- data/spec/shared/language_codes_spec.rb +34 -34
- data/spec/shared/languages_spec.rb +30 -30
- data/spec/shared/numbering_system_spec.rb +7 -7
- data/spec/shared/numbers_spec.rb +4 -4
- data/spec/shared/phone_codes_spec.rb +7 -7
- data/spec/shared/postal_code_generator_spec.rb +76 -0
- data/spec/shared/postal_codes_spec.rb +35 -29
- data/spec/shared/territories_spec.rb +40 -40
- data/spec/shared/unicode_regex_spec.rb +71 -71
- data/spec/spec_helper.rb +2 -2
- data/spec/tokenizers/calendars/date_tokenizer_spec.rb +1 -1
- data/spec/tokenizers/calendars/timespan_tokenizer_spec.rb +6 -6
- data/spec/tokenizers/composite_token_spec.rb +3 -3
- data/spec/tokenizers/token_spec.rb +3 -3
- data/spec/twitter_cldr_spec.rb +72 -72
- data/spec/utils/code_points_spec.rb +10 -10
- data/spec/utils/interpolation_spec.rb +32 -32
- data/spec/utils/range_set_spec.rb +36 -36
- data/spec/utils/regexp_ast_spec.rb +44 -0
- data/spec/utils/regexp_sampler_spec.rb +182 -0
- data/spec/utils/yaml/yaml_spec.rb +23 -23
- data/spec/utils_spec.rb +19 -19
- metadata +263 -258
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4b16ad78f734a596fa4488a61e58b0e971cf2ce
|
4
|
+
data.tar.gz: 1179820a11c171adab4e841cf14efe0e6534a6a7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e4268cc2df5dda027c6b3d2fad2398a0ac5f0597fc620931f0e86cc4dabc775a3d6d0b731290a042ec1114f088244bbd8cc457f2a36066ba8c7c5d8b2810f0b3
|
7
|
+
data.tar.gz: 635c217f690dabc37991f4eff3d215e1312c66a543b2aa7e665e0e3469f0cd10bd37fee2452901ba8c5665f331b0cac2cd059a93f451f03f9118dfc1b97e1217
|
data/Gemfile
CHANGED
@@ -14,6 +14,8 @@ group :development, :test do
|
|
14
14
|
if RUBY_VERSION <= "1.8.7"
|
15
15
|
gem 'oniguruma'
|
16
16
|
end
|
17
|
+
|
18
|
+
gem 'regexp_parser', '~> 0.1.5'
|
17
19
|
end
|
18
20
|
|
19
21
|
group :development do
|
@@ -25,8 +27,8 @@ group :development do
|
|
25
27
|
end
|
26
28
|
|
27
29
|
group :test do
|
28
|
-
gem 'rspec', '~> 2.
|
29
|
-
gem 'rr', '~> 1.
|
30
|
+
gem 'rspec', '~> 2.14.0'
|
31
|
+
gem 'rr', '~> 1.1.2'
|
30
32
|
|
31
33
|
if RUBY_VERSION >= "1.9"
|
32
34
|
gem 'rubyzip'
|
data/History.txt
CHANGED
data/README.md
CHANGED
@@ -439,17 +439,21 @@ The CLDR contains postal code validation regexes for a number of countries.
|
|
439
439
|
|
440
440
|
```ruby
|
441
441
|
# United States
|
442
|
-
TwitterCldr::Shared::PostalCodes.
|
443
|
-
|
442
|
+
postal_code = TwitterCldr::Shared::PostalCodes.for_territory(:us)
|
443
|
+
postal_code.valid?("94103") # true
|
444
|
+
postal_code.valid?("9410") # false
|
444
445
|
|
445
446
|
# England (Great Britain)
|
446
|
-
TwitterCldr::Shared::PostalCodes.
|
447
|
+
postal_code = TwitterCldr::Shared::PostalCodes.for_territory(:gb)
|
448
|
+
postal_code.valid?("BS98 1TL") # true
|
447
449
|
|
448
450
|
# Sweden
|
449
|
-
TwitterCldr::Shared::PostalCodes.
|
451
|
+
postal_code = TwitterCldr::Shared::PostalCodes.for_territory(:se)
|
452
|
+
postal_code.valid?("280 12") # true
|
450
453
|
|
451
454
|
# Canada
|
452
|
-
TwitterCldr::Shared::PostalCodes.
|
455
|
+
postal_code = TwitterCldr::Shared::PostalCodes.for_territory(:ca)
|
456
|
+
postal_code.valid?("V3H 1Z7") # true
|
453
457
|
```
|
454
458
|
|
455
459
|
Get a list of supported territories by using the `#territories` method:
|
@@ -461,7 +465,14 @@ TwitterCldr::Shared::PostalCodes.territories # [:ad, :am, :ar, :as, :at, ... ]
|
|
461
465
|
Just want the regex? No problem:
|
462
466
|
|
463
467
|
```ruby
|
464
|
-
TwitterCldr::Shared::PostalCodes.
|
468
|
+
postal_code = TwitterCldr::Shared::PostalCodes.for_territory(:us)
|
469
|
+
postal_code.regexp # /\d{5}([ \-]\d{4})?/
|
470
|
+
```
|
471
|
+
|
472
|
+
Get a sample of valid postal codes with the `#sample` method:
|
473
|
+
|
474
|
+
```ruby
|
475
|
+
postal_code.sample(5) # ["83526", "31748-8754", "55851", "25788-4914", "55335"]
|
465
476
|
```
|
466
477
|
|
467
478
|
### Phone Codes
|
@@ -35,7 +35,18 @@ module TwitterCldr
|
|
35
35
|
|
36
36
|
postal_codes = Hash[postal_codes.sort_by(&:first)]
|
37
37
|
|
38
|
-
|
38
|
+
postal_codes = postal_codes.each_with_object({}) do |(territory, regex), memo|
|
39
|
+
memo[territory] = {
|
40
|
+
:regex => regex,
|
41
|
+
:ast => TwitterCldr::Utils::RegexpAst.dump(
|
42
|
+
RegexpAstGenerator.generate(regex.source)
|
43
|
+
)
|
44
|
+
}
|
45
|
+
end
|
46
|
+
|
47
|
+
File.open(File.join(@output_path, 'postal_codes.yml'), 'w') do |output|
|
48
|
+
output.write(YAML.dump(postal_codes))
|
49
|
+
end
|
39
50
|
end
|
40
51
|
|
41
52
|
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'regexp_parser'
|
7
|
+
|
8
|
+
module TwitterCldr
  module Resources

    # Turns the source of a regular expression into a tree of
    # TwitterCldr::Utils::RegexpAst nodes. Parsing is delegated to
    # Regexp::Parser (from the regexp_parser gem required by this file).
    class RegexpAstGenerator
      class << self

        # Parses regexp_str and returns the converted root AST node.
        def generate(regexp_str)
          walk(Regexp::Parser.parse(regexp_str))
        end

        private

        # Converts one parser node, depth first: children (if the node
        # exposes any via #expressions) are converted before the node itself.
        def walk(node)
          children = []

          if node.respond_to?(:expressions)
            children = node.expressions.map { |child| walk(child) }
          end

          ast_class = class_for(node)
          ast_class.from_parser_node(node, children)
        end

        # Looks up the RegexpAst class that shares its unqualified name with
        # the class of the given parser node (the argument is a node, not a
        # class, despite its name).
        def class_for(klass)
          short_name = klass.class.to_s.split("::").last
          TwitterCldr::Utils::RegexpAst.const_get(short_name.to_sym)
        end

      end
    end

  end
end
|
@@ -26,5 +26,6 @@ module TwitterCldr
|
|
26
26
|
autoload :RbnfTestImporter, 'twitter_cldr/resources/rbnf_test_importer'
|
27
27
|
autoload :ReadmeRenderer, 'twitter_cldr/resources/readme_renderer'
|
28
28
|
autoload :CasefolderClassGenerator, 'twitter_cldr/resources/casefolder_class_generator'
|
29
|
+
autoload :RegexpAstGenerator, 'twitter_cldr/resources/regexp_ast_generator'
|
29
30
|
end
|
30
31
|
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'set'
|
7
|
+
|
8
|
+
module TwitterCldr
  module Shared

    # Produces sample postal codes from a regexp AST by delegating string
    # generation to TwitterCldr::Utils::RegexpSampler.
    class PostalCodeGenerator

      # Generation attempts are capped at this multiple of the requested
      # sample size.
      SAMPLE_MULTIPLIER = 4

      # regexp_ast - the AST handed to TwitterCldr::Utils::RegexpSampler.
      def initialize(regexp_ast)
        @regexp_generator = TwitterCldr::Utils::RegexpSampler.new(regexp_ast)
      end

      # Returns a single generated postal code string (after clean-up).
      def generate
        clean_result(@regexp_generator.generate)
      end

      # Returns an array of up to sample_size distinct, non-empty postal
      # codes. Fewer may be returned: some territories only have one postal
      # code, so without a cap on attempts a request for 10 would loop
      # forever. A non-positive sample_size yields an empty array without
      # generating anything.
      def sample(sample_size = 1)
        return [] unless sample_size > 0

        samples = Set.new
        attempts = 0
        max_attempts = sample_size * SAMPLE_MULTIPLIER

        while samples.size < sample_size && attempts < max_attempts
          candidate = generate
          samples << candidate unless candidate.empty?
          attempts += 1
        end

        samples.to_a
      end

      private

      # remove spaces that trail a dash
      def clean_result(str)
        str.gsub(/- /, '-')
      end

    end
  end
end
|
@@ -5,7 +5,10 @@
|
|
5
5
|
|
6
6
|
module TwitterCldr
|
7
7
|
module Shared
|
8
|
-
|
8
|
+
|
9
|
+
class InvalidTerritoryError < StandardError; end
|
10
|
+
|
11
|
+
class PostalCodes
|
9
12
|
|
10
13
|
class << self
|
11
14
|
|
@@ -13,23 +16,59 @@ module TwitterCldr
|
|
13
16
|
resource.keys
|
14
17
|
end
|
15
18
|
|
16
|
-
def
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
19
|
+
def for_territory(territory)
|
20
|
+
key = territory.to_s.downcase.to_sym
|
21
|
+
if res = resource[key]
|
22
|
+
territory_cache[key] ||= begin
|
23
|
+
new(
|
24
|
+
territory,
|
25
|
+
res[:regex],
|
26
|
+
TwitterCldr::Utils::RegexpAst.load(res[:ast])
|
27
|
+
)
|
28
|
+
end
|
29
|
+
else
|
30
|
+
raise InvalidTerritoryError, "invalid territory"
|
31
|
+
end
|
23
32
|
end
|
24
33
|
|
25
34
|
private
|
26
35
|
|
36
|
+
def territory_cache
|
37
|
+
@territory_cache ||= {}
|
38
|
+
end
|
39
|
+
|
27
40
|
def resource
|
28
41
|
@resource ||= TwitterCldr.get_resource(:shared, :postal_codes)
|
29
42
|
end
|
30
43
|
|
31
44
|
end
|
32
45
|
|
46
|
+
attr_reader :territory, :regexp, :ast
|
47
|
+
|
48
|
+
def initialize(territory, regexp, ast)
|
49
|
+
@territory = territory
|
50
|
+
@regexp = regexp
|
51
|
+
@ast = ast
|
52
|
+
end
|
53
|
+
|
54
|
+
def valid?(postal_code)
|
55
|
+
!!(regexp && regexp =~ postal_code)
|
56
|
+
end
|
57
|
+
|
58
|
+
def sample(sample_size = 1)
|
59
|
+
generator.sample(sample_size)
|
60
|
+
end
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
def generator
|
65
|
+
generator_cache[territory] ||= PostalCodeGenerator.new(ast)
|
66
|
+
end
|
67
|
+
|
68
|
+
def generator_cache
|
69
|
+
@@generator_cache ||= {}
|
70
|
+
end
|
71
|
+
|
33
72
|
end
|
34
73
|
end
|
35
|
-
end
|
74
|
+
end
|
data/lib/twitter_cldr/shared.rb
CHANGED
@@ -5,19 +5,20 @@
|
|
5
5
|
|
6
6
|
module TwitterCldr
|
7
7
|
module Shared
|
8
|
-
autoload :Calendar,
|
9
|
-
autoload :CodePoint,
|
10
|
-
autoload :Currencies,
|
11
|
-
autoload :LanguageCodes,
|
12
|
-
autoload :Languages,
|
13
|
-
autoload :Numbers,
|
14
|
-
autoload :PhoneCodes,
|
15
|
-
autoload :PostalCodes,
|
16
|
-
autoload :
|
17
|
-
autoload :
|
18
|
-
autoload :
|
19
|
-
autoload :
|
20
|
-
autoload :
|
21
|
-
autoload :
|
8
|
+
autoload :Calendar, 'twitter_cldr/shared/calendar'
|
9
|
+
autoload :CodePoint, 'twitter_cldr/shared/code_point'
|
10
|
+
autoload :Currencies, 'twitter_cldr/shared/currencies'
|
11
|
+
autoload :LanguageCodes, 'twitter_cldr/shared/language_codes'
|
12
|
+
autoload :Languages, 'twitter_cldr/shared/languages'
|
13
|
+
autoload :Numbers, 'twitter_cldr/shared/numbers'
|
14
|
+
autoload :PhoneCodes, 'twitter_cldr/shared/phone_codes'
|
15
|
+
autoload :PostalCodes, 'twitter_cldr/shared/postal_codes'
|
16
|
+
autoload :PostalCodeGenerator, 'twitter_cldr/shared/postal_code_generator'
|
17
|
+
autoload :Bidi, 'twitter_cldr/shared/bidi'
|
18
|
+
autoload :Territories, 'twitter_cldr/shared/territories'
|
19
|
+
autoload :NumberingSystem, 'twitter_cldr/shared/numbering_system'
|
20
|
+
autoload :Casefolder, 'twitter_cldr/shared/casefolder'
|
21
|
+
autoload :UnicodeRegex, 'twitter_cldr/shared/unicode_regex'
|
22
|
+
autoload :BreakIterator, 'twitter_cldr/shared/break_iterator'
|
22
23
|
end
|
23
24
|
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'base64'
|
7
|
+
|
8
|
+
module TwitterCldr
  module Utils

    # A small regular expression AST. Nodes are built from parser nodes via
    # the from_parser_node constructors and are serialized with Marshal,
    # wrapped in Base64 so the result is plain text.
    module RegexpAst

      # Rebuilds an AST from a string produced by .dump.
      #
      # NOTE(security): Marshal.load can instantiate arbitrary objects, so
      # ast_str must come from a trusted source.
      def self.load(ast_str)
        Marshal.load(Base64.decode64(ast_str))
      end

      # Serializes an AST into a Base64-encoded Marshal string.
      def self.dump(ast)
        Base64.encode64(Marshal.dump(ast))
      end

      # Base class for all AST nodes: a list of child expressions plus an
      # optional Quantifier (nil when the node is not quantified).
      class Node
        attr_reader :expressions, :quantifier

        def initialize(expressions, quantifier)
          @expressions = expressions
          @quantifier = quantifier
        end

        # True when a quantifier is attached to this node.
        def quantified?
          !!quantifier
        end

        # Builds a node from a parser node and its already-converted children.
        def self.from_parser_node(node, expressions)
          new(expressions, Quantifier.from_parser_node(node))
        end
      end

      # A character class; members holds entries such as 'A-Z' or '_'.
      class CharacterSet < Node
        attr_reader :members, :negated
        alias :negated? :negated

        def initialize(expressions, quantifier, members, negated)
          @members = members
          @negated = negated
          super(expressions, quantifier)
        end

        def self.from_parser_node(node, expressions)
          new(
            expressions, Quantifier.from_parser_node(node),
            fix_members(node.members), node.negative?
          )
        end

        # CLDR occasionally uses \d and other escapes in character classes
        # to signify 0-9 and friends. This is legal regex syntax, but the
        # regexp_parser gem doesn't handle it correctly, so we have to
        # repair things here.
        #
        # Only \w, \d, word characters, word-character ranges and dashes are
        # picked up by the scan; anything else in members is dropped.
        def self.fix_members(members)
          tokens = members.join.scan(/(\\[wd]|\w-\w|\w|-)/).flatten

          tokens.inject([]) do |ret, member|
            case member
              when '\d' then ret << '0-9'
              when '\w' then ret.concat(['A-Z', 'a-z', '0-9', '_'])
              else ret << member
            end
          end
        end

        # A bare `private` does not apply to singleton methods, so the helper
        # has to be hidden explicitly.
        private_class_method :fix_members
      end

      # A run of literal text.
      class Literal < Node
        attr_reader :text

        def initialize(expressions, quantifier, text)
          @text = text
          super(expressions, quantifier)
        end

        def self.from_parser_node(node, expressions)
          new(
            expressions, Quantifier.from_parser_node(node), node.text
          )
        end
      end

      # Repetition bounds copied from a parser node's quantifier.
      class Quantifier
        attr_reader :max, :min

        # Note the argument order: max first, then min.
        def initialize(max, min)
          @max = max
          @min = min
        end

        # Returns a Quantifier, or nil when the parser node has none.
        def self.from_parser_node(node)
          if node.quantifier
            new(
              node.quantifier.max,
              node.quantifier.min
            )
          end
        end
      end

      class EscapeSequence < Literal; end
      class Word < Node; end
      class Digit < Node; end
      class Sequence < Node; end
      class Alternation < Node; end
      class Capture < Node; end
      class Passive < Node; end
      class Root < Node; end

    end
  end
end
|
@@ -0,0 +1,149 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
module TwitterCldr
  module Utils

    # Generates a valid string that would match the given regexp ast.
    #
    # Each node is dispatched to a private walk_<node type> method derived
    # from the node's class name; node types without such a method
    # contribute an empty string.
    class RegexpSampler

      attr_reader :regexp_ast

      DIGITS = ('0'..'9').to_a
      WORD_LETTERS = ('a'..'z').to_a + ('A'..'Z').to_a + ['_']

      # regexp_ast - root node; must respond to #expressions.
      def initialize(regexp_ast)
        @regexp_ast = regexp_ast
      end

      # Returns one randomly generated string for the AST.
      def generate
        walk_children(regexp_ast)
      end

      private

      # Dispatches to the walker for the node's type. Unsupported node types
      # silently produce "" (no diagnostic output is written).
      def walk(node)
        method = :"walk_#{class_name_for(node)}"
        respond_to?(method, true) ? send(method, node) : ""
      end

      def walk_children(node)
        node.expressions.map { |expr| walk(expr) }.join
      end

      def walk_digit(node)
        sample_from(DIGITS, node)
      end

      def walk_word(node)
        sample_from(WORD_LETTERS, node)
      end

      def walk_literal(node)
        count = node.quantified? ? rand_in_quantifier(node.quantifier) : 1
        (node.text * count) + walk_children(node)
      end

      def walk_character_set(node)
        sample_from(expand_charset(node.members), node)
      end

      def walk_capture(node)
        repeat(node) { walk_children(node) }
      end

      # "passive" means non-capturing group.
      # Since we don't need to distinguish between
      # captures/non-captures, we can just delegate
      # to the walk_capture method.
      def walk_passive(node)
        walk_capture(node)
      end

      # Picks one alternative at random for every repetition.
      def walk_alternation(node)
        repeat(node) { walk(single_sample(node.expressions)) }
      end

      def walk_sequence(node)
        repeat(node) { walk_children(node) }
      end

      # Draws characters from pool (once, or as many times as the node's
      # quantifier dictates) and appends whatever the node's children yield.
      def sample_from(pool, node)
        picks = if node.quantified?
          quantifier_sample(pool, node.quantifier)
        else
          [single_sample(pool)]
        end

        picks.join + walk_children(node)
      end

      # Evaluates the block once for an unquantified node, otherwise a
      # random number of times within the quantifier's bounds, joining the
      # results.
      def repeat(node)
        return yield unless node.quantified?
        rand_in_quantifier(node.quantifier).times.map { yield }.join
      end

      def expand_charset(members)
        members.inject([]) do |ret, member|
          ret + expand_charset_member(member)
        end
      end

      # 'A-Z' expands to every character in the range; a lone character
      # (including '-') expands to itself.
      def expand_charset_member(member)
        left, right = member.scan(/([^\\])-?/).flatten
        right ? (left..right).to_a : [left]
      end

      def quantifier_sample(arr, quantifier)
        sample_size = if quantifier.min == quantifier.max
          quantifier.min
        else
          rand_in_quantifier(quantifier)
        end

        sample_size.times.map { single_sample(arr) }
      end

      def single_sample(arr)
        arr[rand(arr.size)]
      end

      def rand_in_quantifier(quantifier)
        rand_in_range(quantifier.min, quantifier.max)
      end

      # Returns a random Integer between min and max inclusive.
      #
      # When max < min the span is not positive. Kernel#rand would then
      # return a Float (for 0) or use the absolute value (for a negative
      # number), so fall back to min instead.
      # NOTE(review): presumably max < min is how the parser marks
      # open-ended quantifiers - confirm against regexp_parser.
      def rand_in_range(min, max)
        span = (max - min) + 1
        return min if span <= 0
        min + rand(span)
      end

      # Underscored, unqualified class name, e.g. CharacterSet -> character_set.
      # The \A alternative consumes the empty match at the start so the first
      # capital letter does not get a leading underscore.
      def class_name_for(node)
        name = node.class.to_s.split("::").last
        name.gsub(/\A|([A-Z])/) { $1 ? "_#{$1.downcase}" : "" }.downcase
      end

    end
  end
end
|
data/lib/twitter_cldr/utils.rb
CHANGED
@@ -6,9 +6,11 @@
|
|
6
6
|
module TwitterCldr
|
7
7
|
module Utils
|
8
8
|
|
9
|
-
autoload :CodePoints,
|
10
|
-
autoload :YAML,
|
11
|
-
autoload :RangeSet,
|
9
|
+
autoload :CodePoints, 'twitter_cldr/utils/code_points'
|
10
|
+
autoload :YAML, 'twitter_cldr/utils/yaml'
|
11
|
+
autoload :RangeSet, 'twitter_cldr/utils/range_set'
|
12
|
+
autoload :RegexpAst, 'twitter_cldr/utils/regexp_ast'
|
13
|
+
autoload :RegexpSampler, 'twitter_cldr/utils/regexp_sampler'
|
12
14
|
|
13
15
|
class << self
|
14
16
|
|
data/lib/twitter_cldr/version.rb
CHANGED