prose 0.0.1 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +13 -5
  2. data/lib/prose/prose.yaml +147 -141
  3. data/lib/prose.rb +29 -11
  4. metadata +7 -7
checksums.yaml CHANGED
@@ -1,7 +1,15 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 09b137dde0e95e9308ecdc067bf4fd562a9b1526
4
- data.tar.gz: 8cf9634b26218339d84eb677dfaea9432fdd9c36
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ Y2MzMzgzYzhmNjVkNjA1OWRmYjFkMWIxMDNhYmFiYTI0OGIzZThhMg==
5
+ data.tar.gz: !binary |-
6
+ YjM1NjNhYmRkYjM2YzdhNTU2OTFlYzcyZjc5ZTg1MDQ3OThlMjAzNQ==
5
7
  SHA512:
6
- metadata.gz: be87ec95d9bc12c36caa5d69d2192f88b9f02ed041f97fb186142c71e5a9d1f371d9ec60980ee520f8e928339182706e467aafbdbec6112c2cce433e20f74c03
7
- data.tar.gz: 220d5d89073e35e652d91e89d6a71845166d5679c5ac35777af0f80713c56d727be5a6516eb33c89515a2c095142c0945f22282b61b1fb04cb20806a4af15197
8
+ metadata.gz: !binary |-
9
+ NDMyM2MyMDgxOTgzNWUwMmRkOWRkZWI1N2YyMGFmNDkwNTk2NmVhNTIyNzM3
10
+ N2U2ZjZiZDE4Njk1YmFjZmQ2MGY3ZWIzMTA5OTdhMDg3ZGE2ZmI4OTdmZDcx
11
+ MWY3ZDY5Mzg2MWYxOTk5MzcxNzQwMWQzMmFhNmQxZjgwNzEzZTk=
12
+ data.tar.gz: !binary |-
13
+ YWU2YTdjOWZkMWFjODNiZmRlMjRlMWMxZWIwY2ExZGQ3MTQ1Y2RhYmZhNGNl
14
+ ODhmNDZjMDI4OWU2YTJlMTVjN2I1MWUzZmNlNzdhMGZmZWFmYjVlYzRiNjM3
15
+ ZWRiMDQyMWY3MGNlZDNiYzNkMzc0YjdlZDk3MGI2NGE4MTZmNDk=
data/lib/prose/prose.yaml CHANGED
@@ -1,141 +1,147 @@
1
- 0590-05FF: hebrew
2
- FB00–FB4F: hebrew
3
- 00D00-0D7F: malayalam
4
- 0530-058F: armenian
5
- 2C80-2CFF: coptic
6
- 10800-1083F: cypriot
7
- 0400-04FF: cyrillic
8
- 0500-052F: cyrillic
9
- 2DE0-2DFF: cyrillic
10
- A640-A69F: cyrillic
11
- 10A0-10FF: georgian
12
- 2D00-2D2F: georgian
13
- 2C00-2C5F: glagolithic
14
- 10330-1034F: gothic
15
- 0370-03FF: greek
16
- 1F00-1FFF: greek
17
- 0000-007F: latin
18
- 0080-00FF: latin
19
- 0100-017F: latin
20
- 0180-024F: latin
21
- 2C60-2C7F: latin
22
- A720-A7FF: latin
23
- 1E00-1EFF: latin
24
- FB00-FB4F: latin
25
- FB00-FB4F: latin
26
- FF00-FFEF: latin
27
- 1680-169F: ogham
28
- 10300-1032F: old_italics
29
- 101D0-101FF: phaistos
30
- 16A0-16FF: runic
31
- 10450-1047F: shavian
32
- A6A0-A6FF: bamum
33
- 16800-16A3F: bamum
34
- 13000-1342F: egyptian_hieroglyphs
35
- 1200-137F: ethiopic
36
- 1380-139F: ethiopic
37
- 2D80-2DDF: ethiopic
38
- AB00-AB2F: ethiopic
39
- 109A0-109FF: meroitic_cursive
40
- 10980-1099F: meroitic_hieroglyphs
41
- 07C0-07FF: nko
42
- 10480-104AF: osmanya
43
- 2D30-2D7F: tifinagh
44
- A500-A63F: vai
45
- 0600-06FF: arabic
46
- 0750-077F: arabic
47
- 08A0-08FF: arabic
48
- FB50-FDFF: arabic
49
- FE70-FEFF: arabic
50
- 10840-1085F: aramic
51
- 10B00-10B3F: avestan
52
- 102A0-102DF: carian
53
- 12000-123FF: cuniform
54
- 12400-1247F: cuniform_numbers_punctuation
55
- 10280-1029F: lycian
56
- 1800-18AF: mongolian
57
- 0F00-0FFF: tibetan
58
- 0980-09FF: bengali_assamese
59
- 0A80-0AFF: gujarati
60
- 0C80-0CFF: kannada
61
- 0B00-0B7F: oriya
62
- 0B80-0BFF: tamil
63
- 0C00-0C7F: telugu
64
- 11000-1107F: brahmi
65
- 0900-097F: devanagari
66
- A8E0-A8FF: devanagari
67
- 103A0-103DF: old_persian
68
- 10380-1039F: ugaritic
69
- 10920-1093F: lydian
70
- 0840-085F: mandaic
71
- 10A60-10A7F: old_south_arabian
72
- 10B60-10B7F: pahlavi
73
- 10B40-10B5F: parthian
74
- 10900-1091F: phoenician
75
- 0800-083F: samaritan
76
- 0700-074F: syriac
77
- 10C00-10C4F: old_turkic
78
- A840-A87F: phags_pa
79
- 11100-1114F: chakma
80
- 0A00-0A7F: gurmukhi
81
- 11080-110CF: kaithi
82
- 10A00-10A5F: kharoshthi
83
- 1C00-1C4F: lepcha
84
- 1900-194F: limbu
85
- ABC0-ABFF: meetei_mayek
86
- AAE0-AAFF: meetei_mayek
87
- 1C50-1C7F: ol_chiki
88
- A880-A8DF: saurashtra
89
- 11180-111DF: sharada
90
- 0D80-0DFF: sinhala
91
- 110D0-110FF: sora_sompeng
92
- A800-A82F: syloti_nagri
93
- 11680-116CF: takri
94
- 0780-07BF: thaana
95
- 1CD0-1CFF: vedic
96
- 1B00-1B7F: balinese
97
- 1BC0-1BFF: batak
98
- 1A00-1A1F: buginese
99
- AA00-AA5F: cham
100
- A980-A9DF: javanese
101
- A900-A92F: kayah_li
102
- 1780-17FF: khmer
103
- 19E0-19FF: khmer
104
- 0E80-0EFF: lao
105
- 1000-109F: myanmar
106
- AA60-AA7F: myanmar
107
- 1980-19DF: new_tai_lue
108
- A930-A95F: rejang
109
- 1B80-1BBF: sudanese
110
- 1CC0-1CCF: sudanese
111
- 1950-197F: tai_le
112
- 1A20-1AAF: tai_tham
113
- AA80-AADF: tai_viet
114
- 0E00-0E7F: thai
115
- 1740-175F: buhid
116
- 1720-173F: hanunoo
117
- 1700-171F: tagalog
118
- 1760-177F: tagbanwa
119
- 3100-312F: bopomofo
120
- 31A0-31BF: bopomofo
121
- 1100-11FF: hangul_jamo
122
- A960-A97F: hangul_jamo
123
- D7B0-D7FF: hangul_jamo
124
- 3130-318F: hangul_jamo
125
- FF00-FFEF: hangul_jamo
126
- AC00-D7AF: hangul
127
- 3040-309F: hiragana
128
- 30A0-30FF: katakana
129
- 31F0-31FF: katakana
130
- FF00-FFEF: katakana
131
- 1B000-1B0FF: kana
132
- 3190-319F: kanbun
133
- A4D0-A4FF: lisu
134
- 16F00-16F9F: miao
135
- A000-A48F: yi
136
- A490-A4CF: yi
137
- 13A0-13FF: cherokee
138
- 10400-1044F: deseret
139
- 1400-167F: united_canadian_aborginal
140
- 18B0-18FF: united_canadian_aborginal
141
- #0000-007F: ASCII
1
+ # ranges:
2
+ 0590-05FF: "hebrew-1"
3
+ FB00–FB4F: "hebrew-2"
4
+ 00D00-0D7F: malayalam
5
+ 0530-058F: armenian
6
+ 2C80-2CFF: coptic
7
+ 10800-1083F: cypriot
8
+ 0400-04FF: "cyrillic-1"
9
+ 0500-052F: "cyrillic-2"
10
+ 2DE0-2DFF: "cyrillic-3"
11
+ A640-A69F: "cyrillic-4"
12
+ 10A0-10FF: "georgian-1"
13
+ 2D00-2D2F: "georgian-2"
14
+ 2C00-2C5F: glagolithic
15
+ 10330-1034F: gothic
16
+ 0370-03FF: "greek-1"
17
+ 1F00-1FFF: "greek-2"
18
+ 0000-007F: "latin-1"
19
+ 0080-00FF: "latin-2"
20
+ 0100-017F: "latin-3"
21
+ 0180-024F: "latin-4"
22
+ 2C60-2C7F: "latin-5"
23
+ A720-A7FF: "latin-6"
24
+ 1E00-1EFF: "latin-7"
25
+ FB00-FB4F: "latin-8"
26
+ FB00-FB4F: "latin-9"
27
+ FF00-FFEF: "latin-10"
28
+ 1680-169F: ogham
29
+ 10300-1032F: old_italics
30
+ 101D0-101FF: phaistos
31
+ 16A0-16FF: runic
32
+ 10450-1047F: shavian
33
+ A6A0-A6FF: bamum
34
+ 16800-16A3F: bamum
35
+ 13000-1342F: egyptian_hieroglyphs
36
+ 1200-137F: "ethiopic-1"
37
+ 1380-139F: "ethiopic-2"
38
+ 2D80-2DDF: "ethiopic-3"
39
+ AB00-AB2F: ethiopic
40
+ 109A0-109FF: meroitic_cursive
41
+ 10980-1099F: meroitic_hieroglyphs
42
+ 07C0-07FF: nko
43
+ 10480-104AF: osmanya
44
+ 2D30-2D7F: tifinagh
45
+ A500-A63F: vai
46
+ 0600-06FF: "arabic-1"
47
+ 0750-077F: "arabic-2"
48
+ 08A0-08FF: "arabic-3"
49
+ FB50-FDFF: "arabic-4"
50
+ FE70-FEFF: "arabic-5"
51
+ 10840-1085F: aramic
52
+ 10B00-10B3F: avestan
53
+ 102A0-102DF: carian
54
+ 12000-123FF: cuniform
55
+ 12400-1247F: cuniform_numbers_punctuation
56
+ 10280-1029F: lycian
57
+ 1800-18AF: mongolian
58
+ 0F00-0FFF: tibetan
59
+ 0980-09FF: bengali_assamese
60
+ 0A80-0AFF: gujarati
61
+ 0C80-0CFF: kannada
62
+ 0B00-0B7F: oriya
63
+ 0B80-0BFF: tamil
64
+ 0C00-0C7F: telugu
65
+ 11000-1107F: brahmi
66
+ 0900-097F: "devanagari-1"
67
+ A8E0-A8FF: "devanagari-2"
68
+ 103A0-103DF: old_persian
69
+ 10380-1039F: ugaritic
70
+ 10920-1093F: lydian
71
+ 0840-085F: mandaic
72
+ 10A60-10A7F: old_south_arabian
73
+ 10B60-10B7F: pahlavi
74
+ 10B40-10B5F: parthian
75
+ 10900-1091F: phoenician
76
+ 0800-083F: samaritan
77
+ 0700-074F: syriac
78
+ 10C00-10C4F: old_turkic
79
+ A840-A87F: phags_pa
80
+ 11100-1114F: chakma
81
+ 0A00-0A7F: gurmukhi
82
+ 11080-110CF: kaithi
83
+ 10A00-10A5F: kharoshthi
84
+ 1C00-1C4F: lepcha
85
+ 1900-194F: limbu
86
+ ABC0-ABFF: "meetei_mayek-1"
87
+ AAE0-AAFF: "meetei_mayek-2"
88
+ 1C50-1C7F: ol_chiki
89
+ A880-A8DF: saurashtra
90
+ 11180-111DF: sharada
91
+ 0D80-0DFF: sinhala
92
+ 110D0-110FF: sora_sompeng
93
+ A800-A82F: syloti_nagri
94
+ 11680-116CF: takri
95
+ 0780-07BF: thaana
96
+ 1CD0-1CFF: vedic
97
+ 1B00-1B7F: balinese
98
+ 1BC0-1BFF: batak
99
+ 1A00-1A1F: buginese
100
+ AA00-AA5F: cham
101
+ A980-A9DF: javanese
102
+ A900-A92F: kayah_li
103
+ 1780-17FF: "khmer-1"
104
+ 19E0-19FF: "khmer-2"
105
+ 0E80-0EFF: lao
106
+ 1000-109F: "myanmar-1"
107
+ AA60-AA7F: "myanmar-2"
108
+ 1980-19DF: new_tai_lue
109
+ A930-A95F: rejang
110
+ 1B80-1BBF: "sudanese-1"
111
+ 1CC0-1CCF: "sudanese-2"
112
+ 1950-197F: tai_le
113
+ 1A20-1AAF: tai_tham
114
+ AA80-AADF: tai_viet
115
+ 0E00-0E7F: thai
116
+ 1740-175F: buhid
117
+ 1720-173F: hanunoo
118
+ 1700-171F: tagalog
119
+ 1760-177F: tagbanwa
120
+ 3100-312F: "bopomofo-1"
121
+ 31A0-31BF: "bopomofo-2"
122
+ 1100-11FF: "hangul_jamo-1"
123
+ A960-A97F: "hangul_jamo-2"
124
+ D7B0-D7FF: "hangul_jamo-3"
125
+ 3130-318F: "hangul_jamo-4"
126
+ FF00-FFEF: "hangul_jamo-5"
127
+ AC00-D7AF: hangul
128
+ 3040-309F: hiragana
129
+ 30A0-30FF: "katakana-1"
130
+ 31F0-31FF: "katakana-2"
131
+ FF00-FFEF: "katakana-3"
132
+ 1B000-1B0FF: kana
133
+ 3190-319F: kanbun
134
+ A4D0-A4FF: lisu
135
+ 16F00-16F9F: miao
136
+ A000-A48F: yi
137
+ A490-A4CF: yi
138
+ 13A0-13FF: cherokee
139
+ 10400-1044F: deseret
140
+ 1400-167F: "united_canadian_aborginal-1"
141
+ 18B0-18FF: "united_canadian_aborginal-2"
142
+ #0000-007F: ASCII
143
+
144
+ # languages:
145
+ # # Reserved for future use, if ever needed.
146
+ # hebrew:
147
+ # - hebrew
data/lib/prose.rb CHANGED
@@ -1,25 +1,43 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  require 'yaml'
3
+ require 'pry'
3
4
 
4
5
  class String
5
6
 
6
- def prose?
7
- find_languages_in(self)
8
- end
7
# Maps each Unicode range, written as a "MIN-MAX" pair of hexadecimal code
# points, to the script name stored for it in prose/prose.yaml. A script that
# covers several ranges is stored with a numeric suffix ("latin-1", "latin-2").
# YAML.load_file closes the file once it has been parsed, so no file handle is
# left open the way a bare File.open would leave one.
RANGES ||= YAML.load_file("#{File.expand_path(File.dirname(__FILE__))}/prose/prose.yaml")
# Inverse lookup: script name (suffix included) => range string.
LANGUAGES ||= RANGES.invert
9
9
 
10
- private
10
# Lists the scripts detected among the characters of this string.
#
# @return [Array<String>] whatever find_languages_in returns for the receiver
def prose
  find_languages_in self
end
11
13
 
14
# Defines one predicate per script name found in LANGUAGES, e.g. +latin?+ or
# +hebrew?+. Names are cut at the first "-" so that suffixed entries such as
# "latin-1" and "latin-2" share a single +latin?+ method, defined once.
#
# define_method is used instead of eval'ing generated source: the block
# closes over +name+, so each predicate knows which script it tests without
# consulting __method__, and no text from the YAML file is executed as code.
#
# Each predicate takes an optional +pure+ flag (default false):
#   "abc".latin?        # true when latin is among the detected scripts
#   "abc".latin?(true)  # true when no script other than latin is detected
LANGUAGES.keys.map { |language| language.split('-').first }.uniq.each do |name|
  define_method("#{name}?") do |pure = false|
    detected = find_languages_in(self)
    pure ? (detected - [name]).empty? : detected.include?(name)
  end
end
12
29
 
13
- def unicode_ranges
14
- @ranges ||= YAML::load( File.open( "#{File.expand_path File.dirname(__FILE__)}/prose/prose.yaml" ) )
30
# Tells whether a code point lies inside a Unicode range, bounds included.
#
# The bounds listed in prose.yaml are the first and last code points of a
# block, so both comparisons are inclusive. Strict comparisons drop the
# characters sitting exactly on a bound, e.g. U+AC00 (first Hangul syllable)
# in AC00-D7AF or U+00FF at the end of 0080-00FF.
#
# @param ordinal [Integer] code point to test
# @param min_range [String, nil] lower bound as hexadecimal digits
# @param max_range [String, nil] upper bound as hexadecimal digits
# @return [Boolean] true when min <= ordinal <= max; false when a bound is missing
def language_of(ordinal, min_range, max_range)
  # A range key that did not split into two parts yields a nil bound; treat
  # it as matching nothing instead of failing on nil.to_i(16).
  return false if min_range.nil? || max_range.nil?

  min_range.to_i(16) <= ordinal && ordinal <= max_range.to_i(16)
end
16
33
 
17
- def language_of letter
34
# Collects the script names whose range contains the given character.
#
# Range keys are split on "-" and also on the en dash (U+2013), because one
# key in prose.yaml ("FB00–FB4F") is written with an en dash; splitting on
# "-" alone leaves that key without an upper bound.
#
# @param letter [String] a single character
# @return [Array<String>] script names with any "-N" suffix removed, in
#   RANGES order and not de-duplicated
def languages_of(letter)
  code_point = letter.ord
  RANGES.each_with_object([]) do |(range, name), matches|
    lower, upper = range.split(/[-\u2013]/)
    matches << name.split('-').first if language_of(code_point, lower, upper)
  end
end
@@ -27,9 +45,9 @@ class String
27
45
  def find_languages_in word
28
46
  result = []
29
47
  word.split('').each do |letter|
30
- result += language_of(letter) if not letter == " "
48
+ result += languages_of(letter) if (letter != " ")
31
49
  end
32
50
  return result.uniq
33
51
  end
34
52
 
35
- end
53
+ end
metadata CHANGED
@@ -1,16 +1,16 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: prose
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Edwin Rozario
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-01 00:00:00.000000000 Z
11
+ date: 2014-08-18 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Identifies language of alphabets in a string
13
+ description: Language detector
14
14
  email:
15
15
  - rozarioed@gmail.com
16
16
  executables: []
@@ -28,18 +28,18 @@ require_paths:
28
28
  - lib
29
29
  required_ruby_version: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ! '>='
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
34
  required_rubygems_version: !ruby/object:Gem::Requirement
35
35
  requirements:
36
- - - '>='
36
+ - - ! '>='
37
37
  - !ruby/object:Gem::Version
38
38
  version: '0'
39
39
  requirements: []
40
40
  rubyforge_project:
41
- rubygems_version: 2.2.0
41
+ rubygems_version: 2.4.1
42
42
  signing_key:
43
43
  specification_version: 4
44
- summary: Identify language string
44
+ summary: Language detector
45
45
  test_files: []