twitter_cldr 3.0.1 → 3.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +4 -2
- data/History.txt +4 -0
- data/README.md +17 -6
- data/lib/twitter_cldr/resources/postal_codes_importer.rb +12 -1
- data/lib/twitter_cldr/resources/regexp_ast_generator.rb +41 -0
- data/lib/twitter_cldr/resources.rb +1 -0
- data/lib/twitter_cldr/shared/postal_code_generator.rb +50 -0
- data/lib/twitter_cldr/shared/postal_codes.rb +48 -9
- data/lib/twitter_cldr/shared.rb +15 -14
- data/lib/twitter_cldr/utils/regexp_ast.rb +115 -0
- data/lib/twitter_cldr/utils/regexp_sampler.rb +149 -0
- data/lib/twitter_cldr/utils.rb +5 -3
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/shared/postal_codes.yml +1442 -159
- data/spec/bidi/bidi_spec.rb +1 -1
- data/spec/collation/collation_spec.rb +1 -1
- data/spec/collation/collator_spec.rb +31 -31
- data/spec/collation/implicit_collation_elements_spec.rb +6 -6
- data/spec/collation/sort_key_builder_spec.rb +28 -26
- data/spec/collation/tailoring_spec.rb +1 -1
- data/spec/collation/trie_builder_spec.rb +16 -16
- data/spec/collation/trie_dumps_spec.rb +2 -2
- data/spec/collation/trie_loader_spec.rb +8 -8
- data/spec/collation/trie_spec.rb +61 -61
- data/spec/collation/trie_with_fallback_spec.rb +5 -5
- data/spec/core_ext_spec.rb +1 -1
- data/spec/data_readers/additional_date_format_selector_spec.rb +38 -38
- data/spec/data_readers/date_time_data_reader_spec.rb +2 -2
- data/spec/data_readers/number_data_reader_spec.rb +1 -1
- data/spec/formatters/calendars/datetime_formatter_spec.rb +218 -218
- data/spec/formatters/list_formatter_spec.rb +8 -8
- data/spec/formatters/numbers/abbreviated/abbreviated_number_formatter_spec.rb +14 -14
- data/spec/formatters/numbers/abbreviated/long_decimal_formatter_spec.rb +4 -4
- data/spec/formatters/numbers/abbreviated/short_decimal_formatter_spec.rb +4 -4
- data/spec/formatters/numbers/currency_formatter_spec.rb +11 -11
- data/spec/formatters/numbers/decimal_formatter_spec.rb +3 -3
- data/spec/formatters/numbers/helpers/fraction_spec.rb +3 -3
- data/spec/formatters/numbers/helpers/integer_spec.rb +16 -16
- data/spec/formatters/numbers/number_formatter_spec.rb +21 -21
- data/spec/formatters/numbers/percent_formatter_spec.rb +3 -3
- data/spec/formatters/numbers/rbnf/rbnf_spec.rb +2 -2
- data/spec/formatters/plurals/plural_formatter_spec.rb +41 -41
- data/spec/formatters/plurals/rules_spec.rb +13 -13
- data/spec/localized/localized_array_spec.rb +12 -12
- data/spec/localized/localized_date_spec.rb +33 -33
- data/spec/localized/localized_datetime_spec.rb +11 -11
- data/spec/localized/localized_hash_spec.rb +4 -4
- data/spec/localized/localized_number_spec.rb +36 -36
- data/spec/localized/localized_object_spec.rb +8 -8
- data/spec/localized/localized_string_spec.rb +53 -53
- data/spec/localized/localized_symbol_spec.rb +9 -9
- data/spec/localized/localized_time_spec.rb +10 -10
- data/spec/localized/localized_timespan_spec.rb +8 -8
- data/spec/normalization_spec.rb +6 -6
- data/spec/parsers/number_parser_spec.rb +36 -36
- data/spec/parsers/parser_spec.rb +5 -5
- data/spec/parsers/segmentation_parser_spec.rb +19 -19
- data/spec/parsers/symbol_table_spec.rb +4 -4
- data/spec/parsers/unicode_regex/character_class_spec.rb +19 -19
- data/spec/parsers/unicode_regex/character_range_spec.rb +1 -1
- data/spec/parsers/unicode_regex/character_set_spec.rb +8 -8
- data/spec/parsers/unicode_regex/literal_spec.rb +5 -5
- data/spec/parsers/unicode_regex/unicode_string_spec.rb +2 -2
- data/spec/parsers/unicode_regex_parser_spec.rb +28 -28
- data/spec/resources/loader_spec.rb +32 -32
- data/spec/shared/break_iterator_spec.rb +13 -13
- data/spec/shared/calendar_spec.rb +59 -59
- data/spec/shared/casefolder_spec.rb +5 -5
- data/spec/shared/code_point_spec.rb +46 -46
- data/spec/shared/currencies_spec.rb +7 -7
- data/spec/shared/language_codes_spec.rb +34 -34
- data/spec/shared/languages_spec.rb +30 -30
- data/spec/shared/numbering_system_spec.rb +7 -7
- data/spec/shared/numbers_spec.rb +4 -4
- data/spec/shared/phone_codes_spec.rb +7 -7
- data/spec/shared/postal_code_generator_spec.rb +76 -0
- data/spec/shared/postal_codes_spec.rb +35 -29
- data/spec/shared/territories_spec.rb +40 -40
- data/spec/shared/unicode_regex_spec.rb +71 -71
- data/spec/spec_helper.rb +2 -2
- data/spec/tokenizers/calendars/date_tokenizer_spec.rb +1 -1
- data/spec/tokenizers/calendars/timespan_tokenizer_spec.rb +6 -6
- data/spec/tokenizers/composite_token_spec.rb +3 -3
- data/spec/tokenizers/token_spec.rb +3 -3
- data/spec/twitter_cldr_spec.rb +72 -72
- data/spec/utils/code_points_spec.rb +10 -10
- data/spec/utils/interpolation_spec.rb +32 -32
- data/spec/utils/range_set_spec.rb +36 -36
- data/spec/utils/regexp_ast_spec.rb +44 -0
- data/spec/utils/regexp_sampler_spec.rb +182 -0
- data/spec/utils/yaml/yaml_spec.rb +23 -23
- data/spec/utils_spec.rb +19 -19
- metadata +263 -258
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a4b16ad78f734a596fa4488a61e58b0e971cf2ce
|
|
4
|
+
data.tar.gz: 1179820a11c171adab4e841cf14efe0e6534a6a7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e4268cc2df5dda027c6b3d2fad2398a0ac5f0597fc620931f0e86cc4dabc775a3d6d0b731290a042ec1114f088244bbd8cc457f2a36066ba8c7c5d8b2810f0b3
|
|
7
|
+
data.tar.gz: 635c217f690dabc37991f4eff3d215e1312c66a543b2aa7e665e0e3469f0cd10bd37fee2452901ba8c5665f331b0cac2cd059a93f451f03f9118dfc1b97e1217
|
data/Gemfile
CHANGED
|
@@ -14,6 +14,8 @@ group :development, :test do
|
|
|
14
14
|
if RUBY_VERSION <= "1.8.7"
|
|
15
15
|
gem 'oniguruma'
|
|
16
16
|
end
|
|
17
|
+
|
|
18
|
+
gem 'regexp_parser', '~> 0.1.5'
|
|
17
19
|
end
|
|
18
20
|
|
|
19
21
|
group :development do
|
|
@@ -25,8 +27,8 @@ group :development do
|
|
|
25
27
|
end
|
|
26
28
|
|
|
27
29
|
group :test do
|
|
28
|
-
gem 'rspec', '~> 2.
|
|
29
|
-
gem 'rr', '~> 1.
|
|
30
|
+
gem 'rspec', '~> 2.14.0'
|
|
31
|
+
gem 'rr', '~> 1.1.2'
|
|
30
32
|
|
|
31
33
|
if RUBY_VERSION >= "1.9"
|
|
32
34
|
gem 'rubyzip'
|
data/History.txt
CHANGED
data/README.md
CHANGED
|
@@ -439,17 +439,21 @@ The CLDR contains postal code validation regexes for a number of countries.
|
|
|
439
439
|
|
|
440
440
|
```ruby
|
|
441
441
|
# United States
|
|
442
|
-
TwitterCldr::Shared::PostalCodes.
|
|
443
|
-
|
|
442
|
+
postal_code = TwitterCldr::Shared::PostalCodes.for_territory(:us)
|
|
443
|
+
postal_code.valid?("94103") # true
|
|
444
|
+
postal_code.valid?("9410") # false
|
|
444
445
|
|
|
445
446
|
# England (Great Britain)
|
|
446
|
-
TwitterCldr::Shared::PostalCodes.
|
|
447
|
+
postal_code = TwitterCldr::Shared::PostalCodes.for_territory(:gb)
|
|
448
|
+
postal_code.valid?("BS98 1TL") # true
|
|
447
449
|
|
|
448
450
|
# Sweden
|
|
449
|
-
TwitterCldr::Shared::PostalCodes.
|
|
451
|
+
postal_code = TwitterCldr::Shared::PostalCodes.for_territory(:se)
|
|
452
|
+
postal_code.valid?("280 12") # true
|
|
450
453
|
|
|
451
454
|
# Canada
|
|
452
|
-
TwitterCldr::Shared::PostalCodes.
|
|
455
|
+
postal_code = TwitterCldr::Shared::PostalCodes.for_territory(:ca)
|
|
456
|
+
postal_code.valid?("V3H 1Z7") # true
|
|
453
457
|
```
|
|
454
458
|
|
|
455
459
|
Get a list of supported territories by using the `#territories` method:
|
|
@@ -461,7 +465,14 @@ TwitterCldr::Shared::PostalCodes.territories # [:ad, :am, :ar, :as, :at, ... ]
|
|
|
461
465
|
Just want the regex? No problem:
|
|
462
466
|
|
|
463
467
|
```ruby
|
|
464
|
-
TwitterCldr::Shared::PostalCodes.
|
|
468
|
+
postal_code = TwitterCldr::Shared::PostalCodes.for_territory(:us)
|
|
469
|
+
postal_code.regexp # /\d{5}([ \-]\d{4})?/
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
Get a sample of valid postal codes with the `#sample` method:
|
|
473
|
+
|
|
474
|
+
```ruby
|
|
475
|
+
postal_code.sample(5) # ["83526", "31748-8754", "55851", "25788-4914", "55335"]
|
|
465
476
|
```
|
|
466
477
|
|
|
467
478
|
### Phone Codes
|
|
@@ -35,7 +35,18 @@ module TwitterCldr
|
|
|
35
35
|
|
|
36
36
|
postal_codes = Hash[postal_codes.sort_by(&:first)]
|
|
37
37
|
|
|
38
|
-
|
|
38
|
+
postal_codes = postal_codes.each_with_object({}) do |(territory, regex), memo|
|
|
39
|
+
memo[territory] = {
|
|
40
|
+
:regex => regex,
|
|
41
|
+
:ast => TwitterCldr::Utils::RegexpAst.dump(
|
|
42
|
+
RegexpAstGenerator.generate(regex.source)
|
|
43
|
+
)
|
|
44
|
+
}
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
File.open(File.join(@output_path, 'postal_codes.yml'), 'w') do |output|
|
|
48
|
+
output.write(YAML.dump(postal_codes))
|
|
49
|
+
end
|
|
39
50
|
end
|
|
40
51
|
|
|
41
52
|
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
# Copyright 2012 Twitter, Inc
|
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
5
|
+
|
|
6
|
+
require 'regexp_parser'
|
|
7
|
+
|
|
8
|
+
module TwitterCldr
|
|
9
|
+
module Resources
|
|
10
|
+
|
|
11
|
+
# Converts the parse tree produced by the regexp_parser gem into the
# node classes defined under TwitterCldr::Utils::RegexpAst.
class RegexpAstGenerator
  class << self

    # Parses regexp_str with Regexp::Parser and returns the equivalent
    # tree built out of TwitterCldr::Utils::RegexpAst nodes.
    def generate(regexp_str)
      walk(Regexp::Parser.parse(regexp_str))
    end

    private

    # Depth-first conversion: the children of a node are converted first so
    # that the node itself can be built with its already-converted children.
    # Nodes that do not respond to #expressions are treated as leaves.
    def walk(node)
      children = []

      if node.respond_to?(:expressions)
        children = node.expressions.map { |child| walk(child) }
      end

      class_for(node).from_parser_node(node, children)
    end

    # Finds the RegexpAst class whose name equals the last segment of the
    # parser node's class name (e.g. "...::CharacterSet" => :CharacterSet).
    def class_for(parser_node)
      short_name = parser_node.class.to_s.split("::").last
      TwitterCldr::Utils::RegexpAst.const_get(short_name.to_sym)
    end

  end
end
|
|
39
|
+
|
|
40
|
+
end
|
|
41
|
+
end
|
|
@@ -26,5 +26,6 @@ module TwitterCldr
|
|
|
26
26
|
autoload :RbnfTestImporter, 'twitter_cldr/resources/rbnf_test_importer'
|
|
27
27
|
autoload :ReadmeRenderer, 'twitter_cldr/resources/readme_renderer'
|
|
28
28
|
autoload :CasefolderClassGenerator, 'twitter_cldr/resources/casefolder_class_generator'
|
|
29
|
+
autoload :RegexpAstGenerator, 'twitter_cldr/resources/regexp_ast_generator'
|
|
29
30
|
end
|
|
30
31
|
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
# Copyright 2012 Twitter, Inc
|
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
5
|
+
|
|
6
|
+
require 'set'
|
|
7
|
+
|
|
8
|
+
module TwitterCldr
|
|
9
|
+
module Shared
|
|
10
|
+
class PostalCodeGenerator
|
|
11
|
+
|
|
12
|
+
SAMPLE_MULTIPLIER = 4
|
|
13
|
+
|
|
14
|
+
def initialize(regexp_ast)
|
|
15
|
+
@regexp_generator = TwitterCldr::Utils::RegexpSampler.new(regexp_ast)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def generate
|
|
19
|
+
clean_result(@regexp_generator.generate)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def sample(sample_size = 1)
|
|
23
|
+
sample_set = Set.new
|
|
24
|
+
counter = 1
|
|
25
|
+
|
|
26
|
+
until sample_set.size == sample_size
|
|
27
|
+
sample = generate
|
|
28
|
+
sample_set << sample unless sample.empty?
|
|
29
|
+
counter += 1
|
|
30
|
+
|
|
31
|
+
# Stop if the number of attempted generations is
|
|
32
|
+
# n times more than requested. Some territories only
|
|
33
|
+
# have one postal code, so if the user asks for 10
|
|
34
|
+
# they'll get an infinite loop.
|
|
35
|
+
break if counter > sample_size * SAMPLE_MULTIPLIER
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
sample_set.to_a
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
private
|
|
42
|
+
|
|
43
|
+
# remove spaces that trail a dash
|
|
44
|
+
def clean_result(str)
|
|
45
|
+
str.gsub(/- /, '-')
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
@@ -5,7 +5,10 @@
|
|
|
5
5
|
|
|
6
6
|
module TwitterCldr
|
|
7
7
|
module Shared
|
|
8
|
-
|
|
8
|
+
|
|
9
|
+
class InvalidTerritoryError < StandardError; end
|
|
10
|
+
|
|
11
|
+
class PostalCodes
|
|
9
12
|
|
|
10
13
|
class << self
|
|
11
14
|
|
|
@@ -13,23 +16,59 @@ module TwitterCldr
|
|
|
13
16
|
resource.keys
|
|
14
17
|
end
|
|
15
18
|
|
|
16
|
-
def
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
19
|
+
# Returns the PostalCodes instance for the given territory.
#
# territory may be a string or symbol in any letter case; it is normalized
# to a lowercase symbol before the lookup. The instance is built once per
# territory and cached, and it always carries the normalized key as its
# territory so the cached object does not depend on how the first caller
# happened to spell the argument.
#
# Raises InvalidTerritoryError when the resource has no entry for the
# territory.
def for_territory(territory)
  key = territory.to_s.downcase.to_sym
  res = resource[key]
  raise InvalidTerritoryError, "invalid territory" unless res

  territory_cache[key] ||= new(
    key,
    res[:regex],
    TwitterCldr::Utils::RegexpAst.load(res[:ast])
  )
end

private

# Memoized hash of normalized territory key => PostalCodes instance.
def territory_cache
  @territory_cache ||= {}
end

# Memoized postal code resource, loaded via TwitterCldr.get_resource.
def resource
  @resource ||= TwitterCldr.get_resource(:shared, :postal_codes)
end
|
|
30
43
|
|
|
31
44
|
end
|
|
32
45
|
|
|
46
|
+
attr_reader :territory, :regexp, :ast
|
|
47
|
+
|
|
48
|
+
# Stores the territory, its validation regexp and the regexp's AST; each is
# exposed through the reader of the same name.
def initialize(territory, regexp, ast)
  @territory, @regexp, @ast = territory, regexp, ast
end
|
|
53
|
+
|
|
54
|
+
# Returns true only when the WHOLE of postal_code matches this territory's
# regexp. A plain =~ would also accept any longer string that merely
# contains a valid code (e.g. "941031" for a five-digit pattern), so the
# pattern is anchored before matching. Returns false when there is no
# regexp or postal_code is nil.
def valid?(postal_code)
  return false unless regexp

  !!(anchored_regexp =~ postal_code)
end

# Returns an array of up to sample_size generated postal codes for this
# territory (see PostalCodeGenerator#sample).
def sample(sample_size = 1)
  generator.sample(sample_size)
end

private

# The generator is memoized on the instance rather than in a class variable
# shared by every PostalCodes object, so an instance can never pick up a
# generator that was built from another instance's AST.
def generator
  @generator ||= PostalCodeGenerator.new(ast)
end

# The territory regexp wrapped in \A...\z, keeping its original options.
def anchored_regexp
  @anchored_regexp ||= Regexp.new("\\A(?:#{regexp.source})\\z", regexp.options)
end
|
|
71
|
+
|
|
33
72
|
end
|
|
34
73
|
end
|
|
35
|
-
end
|
|
74
|
+
end
|
data/lib/twitter_cldr/shared.rb
CHANGED
|
@@ -5,19 +5,20 @@
|
|
|
5
5
|
|
|
6
6
|
module TwitterCldr
|
|
7
7
|
module Shared
|
|
8
|
-
autoload :Calendar,
|
|
9
|
-
autoload :CodePoint,
|
|
10
|
-
autoload :Currencies,
|
|
11
|
-
autoload :LanguageCodes,
|
|
12
|
-
autoload :Languages,
|
|
13
|
-
autoload :Numbers,
|
|
14
|
-
autoload :PhoneCodes,
|
|
15
|
-
autoload :PostalCodes,
|
|
16
|
-
autoload :
|
|
17
|
-
autoload :
|
|
18
|
-
autoload :
|
|
19
|
-
autoload :
|
|
20
|
-
autoload :
|
|
21
|
-
autoload :
|
|
8
|
+
autoload :Calendar, 'twitter_cldr/shared/calendar'
|
|
9
|
+
autoload :CodePoint, 'twitter_cldr/shared/code_point'
|
|
10
|
+
autoload :Currencies, 'twitter_cldr/shared/currencies'
|
|
11
|
+
autoload :LanguageCodes, 'twitter_cldr/shared/language_codes'
|
|
12
|
+
autoload :Languages, 'twitter_cldr/shared/languages'
|
|
13
|
+
autoload :Numbers, 'twitter_cldr/shared/numbers'
|
|
14
|
+
autoload :PhoneCodes, 'twitter_cldr/shared/phone_codes'
|
|
15
|
+
autoload :PostalCodes, 'twitter_cldr/shared/postal_codes'
|
|
16
|
+
autoload :PostalCodeGenerator, 'twitter_cldr/shared/postal_code_generator'
|
|
17
|
+
autoload :Bidi, 'twitter_cldr/shared/bidi'
|
|
18
|
+
autoload :Territories, 'twitter_cldr/shared/territories'
|
|
19
|
+
autoload :NumberingSystem, 'twitter_cldr/shared/numbering_system'
|
|
20
|
+
autoload :Casefolder, 'twitter_cldr/shared/casefolder'
|
|
21
|
+
autoload :UnicodeRegex, 'twitter_cldr/shared/unicode_regex'
|
|
22
|
+
autoload :BreakIterator, 'twitter_cldr/shared/break_iterator'
|
|
22
23
|
end
|
|
23
24
|
end
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
# Copyright 2012 Twitter, Inc
|
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
5
|
+
|
|
6
|
+
require 'base64'
|
|
7
|
+
|
|
8
|
+
module TwitterCldr
|
|
9
|
+
module Utils
|
|
10
|
+
# A small, serializable regexp AST. Nodes are built from regexp_parser
# nodes via the from_parser_node class methods and can be dumped to /
# loaded from a Base64-encoded Marshal string.
module RegexpAst

  # Rebuilds an AST from a string produced by RegexpAst.dump.
  #
  # NOTE(review): Marshal.load can instantiate arbitrary objects, so this
  # must only ever be given trusted data (presumably the bundled resource
  # files - confirm no caller passes external input).
  def self.load(ast_str)
    Marshal.load(Base64.decode64(ast_str))
  end

  # Serializes an AST to a Base64-encoded Marshal string.
  def self.dump(ast)
    Base64.encode64(Marshal.dump(ast))
  end

  # Base class for all nodes: a list of child expressions plus an optional
  # Quantifier (nil when the node is not quantified).
  class Node
    attr_reader :expressions, :quantifier

    def initialize(expressions, quantifier)
      @expressions = expressions
      @quantifier = quantifier
    end

    def quantified?
      !!quantifier
    end

    # Builds a node from a parser node and its already-converted children.
    def self.from_parser_node(node, expressions)
      new(expressions, Quantifier.from_parser_node(node))
    end
  end

  # A bracketed character class. members holds strings such as "0-9" or
  # "a"; negated records whether the parser reported the set as negative.
  class CharacterSet < Node
    attr_reader :members, :negated
    alias :negated? :negated

    def initialize(expressions, quantifier, members, negated)
      @members = members
      @negated = negated
      super(expressions, quantifier)
    end

    def self.from_parser_node(node, expressions)
      new(
        expressions, Quantifier.from_parser_node(node),
        fix_members(node.members), node.negative?
      )
    end

    # CLDR occasionally uses \d and other escapes in character classes
    # to signify 0-9 and friends. This is legal regex syntax, but the
    # regexp_parser gem doesn't handle it correctly, so we have to
    # repair things here.
    #
    # The members are joined and re-tokenized into \d, \w, "x-y" ranges,
    # single word characters and dashes; \d and \w are then expanded into
    # explicit ranges. Anything the tokenizer does not recognise is dropped.
    #
    # This stays a public class method: a bare `private` keyword does not
    # apply to `def self.` methods, so it has always been callable from
    # outside.
    def self.fix_members(members)
      tokens = members.join.scan(/(\\[wd]|\w-\w|\w|-)/).to_a.flatten

      tokens.inject([]) do |ret, member|
        case member
          when '\d' then ret << '0-9'
          when '\w' then ret.concat(['A-Z', 'a-z', '0-9', '_'])
          else ret << member
        end

        ret
      end
    end
  end

  # A run of literal text.
  class Literal < Node
    attr_reader :text

    def initialize(expressions, quantifier, text)
      @text = text
      super(expressions, quantifier)
    end

    def self.from_parser_node(node, expressions)
      new(
        expressions, Quantifier.from_parser_node(node), node.text
      )
    end
  end

  # Repetition bounds copied from a parser node's quantifier. Note the
  # constructor takes max before min.
  class Quantifier
    attr_reader :max, :min

    def initialize(max, min)
      @max = max
      @min = min
    end

    # Returns a Quantifier, or nil when the parser node has no quantifier.
    def self.from_parser_node(node)
      if node.quantifier
        new(
          node.quantifier.max,
          node.quantifier.min
        )
      end
    end
  end

  class EscapeSequence < Literal; end
  class Word < Node; end
  class Digit < Node; end
  class Sequence < Node; end
  class Alternation < Node; end
  class Capture < Node; end
  class Passive < Node; end
  class Root < Node; end

end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
# Copyright 2012 Twitter, Inc
|
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
5
|
+
|
|
6
|
+
module TwitterCldr
|
|
7
|
+
module Utils
|
|
8
|
+
|
|
9
|
+
# Generates a valid string that would match the given regexp ast.
#
# The tree is walked recursively; each node is dispatched to a private
# walk_<snake_case_class_name> method. Node types without such a method
# contribute an empty string.
class RegexpSampler

  attr_reader :regexp_ast

  DIGITS = ('0'..'9').to_a.freeze
  WORD_LETTERS = (('a'..'z').to_a + ('A'..'Z').to_a + ['_']).freeze

  def initialize(regexp_ast)
    @regexp_ast = regexp_ast
  end

  # Returns one randomly generated string for the AST given at construction.
  def generate
    walk_children(regexp_ast)
  end

  private

  # Dispatches on the node's class name. Unsupported node types are
  # skipped silently; nothing is printed.
  def walk(node)
    method = :"walk_#{class_name_for(node)}"
    respond_to?(method, true) ? send(method, node) : ""
  end

  def walk_children(node)
    node.expressions.map { |expr| walk(expr) }.join
  end

  def walk_digit(node)
    sample_from(DIGITS, node) + walk_children(node)
  end

  def walk_word(node)
    sample_from(WORD_LETTERS, node) + walk_children(node)
  end

  def walk_literal(node)
    node.text * repetitions_for(node) + walk_children(node)
  end

  # NOTE(review): node.negated is not consulted here, so a negated set
  # would be sampled from the very characters it excludes - confirm no
  # input regexp relies on negated sets.
  def walk_character_set(node)
    sample_from(expand_charset(node.members), node) + walk_children(node)
  end

  def walk_capture(node)
    repeat(node) { walk_children(node) }
  end

  # "passive" means non-capturing group.
  # Since we don't need to distinguish between
  # captures/non-captures, we can just delegate
  # to the walk_capture method.
  def walk_passive(node)
    walk_capture(node)
  end

  # Picks one branch at random for every repetition.
  def walk_alternation(node)
    repeat(node) { walk(single_sample(node.expressions)) }
  end

  def walk_sequence(node)
    repeat(node) { walk_children(node) }
  end

  # Number of times a node's content is emitted: a random count within
  # its quantifier, or exactly once when it has none.
  def repetitions_for(node)
    node.quantified? ? rand_in_quantifier(node.quantifier) : 1
  end

  # Evaluates the block once per repetition and joins the results.
  def repeat(node)
    Array.new(repetitions_for(node)) { yield }.join
  end

  # Joins one random element of pool per repetition of node.
  def sample_from(pool, node)
    Array.new(repetitions_for(node)) { single_sample(pool) }.join
  end

  # Turns members such as ["0-9", "a", "-"] into a flat list of characters.
  def expand_charset(members)
    members.inject([]) do |ret, member|
      ret + expand_charset_member(member)
    end
  end

  # "a-z" becomes every character from a to z; a single character becomes
  # a one-element array.
  def expand_charset_member(member)
    left, right = member.scan(/([^\\])-?/).flatten
    right ? (left..right).to_a : [left]
  end

  # Returns an array holding a quantifier-determined number of random
  # elements of arr.
  def quantifier_sample(arr, quantifier)
    Array.new(rand_in_quantifier(quantifier)) { single_sample(arr) }
  end

  def single_sample(arr)
    arr[rand(arr.size)]
  end

  def rand_in_quantifier(quantifier)
    rand_in_range(quantifier.min, quantifier.max)
  end

  # Random integer in min..max inclusive. When max is missing or not above
  # min, min itself is returned: rand(0) would otherwise yield a Float.
  # NOTE(review): regexp_parser appears to report unbounded quantifiers
  # (*, +, {n,}) with max == -1, which is the case this guards against -
  # confirm against the gem's documentation.
  def rand_in_range(min, max)
    return min if max.nil? || max <= min

    min + rand((max - min) + 1)
  end

  # Snake-cases the last segment of the node's class name, e.g.
  # "Foo::CharacterSet" => "character_set".
  def class_name_for(node)
    name = node.class.to_s.split("::").last
    # (?!\A) keeps the leading capital from receiving an underscore.
    name.gsub(/(?!\A)[A-Z]/) { |upper| "_#{upper}" }.downcase
  end

end
|
|
148
|
+
end
|
|
149
|
+
end
|
data/lib/twitter_cldr/utils.rb
CHANGED
|
@@ -6,9 +6,11 @@
|
|
|
6
6
|
module TwitterCldr
|
|
7
7
|
module Utils
|
|
8
8
|
|
|
9
|
-
autoload :CodePoints,
|
|
10
|
-
autoload :YAML,
|
|
11
|
-
autoload :RangeSet,
|
|
9
|
+
autoload :CodePoints, 'twitter_cldr/utils/code_points'
|
|
10
|
+
autoload :YAML, 'twitter_cldr/utils/yaml'
|
|
11
|
+
autoload :RangeSet, 'twitter_cldr/utils/range_set'
|
|
12
|
+
autoload :RegexpAst, 'twitter_cldr/utils/regexp_ast'
|
|
13
|
+
autoload :RegexpSampler, 'twitter_cldr/utils/regexp_sampler'
|
|
12
14
|
|
|
13
15
|
class << self
|
|
14
16
|
|
data/lib/twitter_cldr/version.rb
CHANGED