sanscript 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.codeclimate.yml +9 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.rubocop.yml +97 -0
- data/.travis.yml +9 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +43 -0
- data/Rakefile +7 -0
- data/bin/console +12 -0
- data/bin/setup +8 -0
- data/lib/sanscript.rb +29 -0
- data/lib/sanscript/benchmark.rb +53 -0
- data/lib/sanscript/detect.rb +77 -0
- data/lib/sanscript/refinements.rb +94 -0
- data/lib/sanscript/transliterate.rb +343 -0
- data/lib/sanscript/transliterate/schemes.rb +312 -0
- data/lib/sanscript/version.rb +4 -0
- data/sanscript.gemspec +29 -0
- metadata +148 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d94fb9c3290ec64af941b806bb8cd78f0b66b442
|
4
|
+
data.tar.gz: 3e298a3b363a89081fbf603247f2b49a71144b8a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8ed2a31fa2f140f4e0085638996cbf31693735d07348fb367505fa104a06a1d22834f18ac3cbc0696079ff0503729b0b64192c006ccce1945ad3de5737d8aef3
|
7
|
+
data.tar.gz: fe535ef6247b9d91ab23643566a2d9e86d7c144cdccf66f3ce6f8b18ba49830813c8d9c20e27c10fcb536c480cee25d504119bf2ba7f04d2676b49f59d4135d4
|
data/.codeclimate.yml
ADDED
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
AllCops:
|
2
|
+
TargetRubyVersion: 2.3
|
3
|
+
|
4
|
+
# Metrics
|
5
|
+
Metrics/AbcSize:
|
6
|
+
Description: >-
|
7
|
+
A calculated magnitude based on number of assignments,
|
8
|
+
branches, and conditions.
|
9
|
+
Reference: 'http://c2.com/cgi/wiki?AbcMetric'
|
10
|
+
Enabled: false
|
11
|
+
Max: 20
|
12
|
+
|
13
|
+
Metrics/BlockNesting:
|
14
|
+
Description: 'Avoid excessive block nesting'
|
15
|
+
StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#three-is-the-number-thou-shalt-count'
|
16
|
+
Enabled: true
|
17
|
+
Max: 4
|
18
|
+
|
19
|
+
Metrics/ClassLength:
|
20
|
+
Description: 'Avoid classes longer than 250 lines of code.'
|
21
|
+
Enabled: true
|
22
|
+
Max: 250
|
23
|
+
|
24
|
+
Metrics/CyclomaticComplexity:
|
25
|
+
Description: >-
|
26
|
+
A complexity metric that is strongly correlated to the number
|
27
|
+
of test cases needed to validate a method.
|
28
|
+
Enabled: true
|
29
|
+
Max: 10
|
30
|
+
|
31
|
+
Metrics/LineLength:
|
32
|
+
Description: 'Limit lines to 80 characters.'
|
33
|
+
StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#80-character-limits'
|
34
|
+
Enabled: false
|
35
|
+
|
36
|
+
Metrics/MethodLength:
|
37
|
+
Description: 'Avoid methods longer than 40 lines of code.'
|
38
|
+
Enabled: true
|
39
|
+
Max: 40
|
40
|
+
|
41
|
+
Metrics/ModuleLength:
|
42
|
+
Description: 'Avoid modules longer than 250 lines of code.'
|
43
|
+
Enabled: true
|
44
|
+
Max: 250
|
45
|
+
|
46
|
+
Metrics/ParameterLists:
|
47
|
+
Description: 'Avoid parameter lists longer than three or four parameters.'
|
48
|
+
StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#too-many-params'
|
49
|
+
Enabled: true
|
50
|
+
|
51
|
+
Metrics/PerceivedComplexity:
|
52
|
+
Description: >-
|
53
|
+
A complexity metric geared towards measuring complexity for a
|
54
|
+
human reader.
|
55
|
+
Enabled: false
|
56
|
+
|
57
|
+
# Style
|
58
|
+
Style/AsciiComments:
|
59
|
+
Enabled: false
|
60
|
+
|
61
|
+
Style/CollectionMethods:
|
62
|
+
Enabled: true
|
63
|
+
|
64
|
+
Style/Documentation:
|
65
|
+
Enabled: false
|
66
|
+
|
67
|
+
Style/EmptyLiteral:
|
68
|
+
Enabled: false
|
69
|
+
|
70
|
+
Style/FormatString:
|
71
|
+
EnforcedStyle: percent
|
72
|
+
|
73
|
+
Style/HashSyntax:
|
74
|
+
EnforcedStyle: ruby19_no_mixed_keys
|
75
|
+
|
76
|
+
Style/MethodCalledOnDoEndBlock:
|
77
|
+
Enabled: true
|
78
|
+
|
79
|
+
Style/PercentLiteralDelimiters:
|
80
|
+
PreferredDelimiters:
|
81
|
+
'%i': '[]'
|
82
|
+
'%I': '[]'
|
83
|
+
'%w': '[]'
|
84
|
+
'%W': '[]'
|
85
|
+
|
86
|
+
Style/StringLiterals:
|
87
|
+
EnforcedStyle: double_quotes
|
88
|
+
|
89
|
+
Style/SymbolArray:
|
90
|
+
Enabled: true
|
91
|
+
|
92
|
+
Style/TrailingCommaInLiteral:
|
93
|
+
EnforcedStyleForMultiline: comma
|
94
|
+
|
95
|
+
Style/TrivialAccessors:
|
96
|
+
ExactNameMatch: true
|
97
|
+
AllowPredicates: true
|
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, and in the interest of
|
4
|
+
fostering an open and welcoming community, we pledge to respect all people who
|
5
|
+
contribute through reporting issues, posting feature requests, updating
|
6
|
+
documentation, submitting pull requests or patches, and other activities.
|
7
|
+
|
8
|
+
We are committed to making participation in this project a harassment-free
|
9
|
+
experience for everyone, regardless of level of experience, gender, gender
|
10
|
+
identity and expression, sexual orientation, disability, personal appearance,
|
11
|
+
body size, race, ethnicity, age, religion, or nationality.
|
12
|
+
|
13
|
+
Examples of unacceptable behavior by participants include:
|
14
|
+
|
15
|
+
* The use of sexualized language or imagery
|
16
|
+
* Personal attacks
|
17
|
+
* Trolling or insulting/derogatory comments
|
18
|
+
* Public or private harassment
|
19
|
+
* Publishing others' private information, such as physical or electronic
|
20
|
+
addresses, without explicit permission
|
21
|
+
* Other unethical or unprofessional conduct
|
22
|
+
|
23
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
24
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
25
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
26
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
27
|
+
threatening, offensive, or harmful.
|
28
|
+
|
29
|
+
By adopting this Code of Conduct, project maintainers commit themselves to
|
30
|
+
fairly and consistently applying these principles to every aspect of managing
|
31
|
+
this project. Project maintainers who do not follow or enforce the Code of
|
32
|
+
Conduct may be permanently removed from the project team.
|
33
|
+
|
34
|
+
This code of conduct applies both within project spaces and in public spaces
|
35
|
+
when an individual is representing the project or its community.
|
36
|
+
|
37
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
38
|
+
reported by contacting a project maintainer at nomoon@phoebus.ca. All
|
39
|
+
complaints will be reviewed and investigated and will result in a response that
|
40
|
+
is deemed necessary and appropriate to the circumstances. Maintainers are
|
41
|
+
obligated to maintain confidentiality with regard to the reporter of an
|
42
|
+
incident.
|
43
|
+
|
44
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
45
|
+
version 1.3.0, available at
|
46
|
+
[http://contributor-covenant.org/version/1/3/0/][version]
|
47
|
+
|
48
|
+
[homepage]: http://contributor-covenant.org
|
49
|
+
[version]: http://contributor-covenant.org/version/1/3/0/
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2013 learnsanskrit.org (original Javascript)
|
4
|
+
Copyright (c) 2016 ported by Tim Bellefleur (Ruby port)
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
8
|
+
in the Software without restriction, including without limitation the rights
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
11
|
+
furnished to do so, subject to the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
14
|
+
all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# Sanscript.rb
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/ubcsanskrit/sanscript.rb.svg?branch=master)](https://travis-ci.org/ubcsanskrit/sanscript.rb)
|
4
|
+
[![Code Climate](https://codeclimate.com/github/ubcsanskrit/sanscript.rb/badges/gpa.svg)](https://codeclimate.com/github/ubcsanskrit/sanscript.rb)
|
5
|
+
[![Test Coverage](https://codeclimate.com/github/ubcsanskrit/sanscript.rb/badges/coverage.svg)](https://codeclimate.com/github/ubcsanskrit/sanscript.rb/coverage)
|
6
|
+
[![Dependency Status](https://gemnasium.com/badges/github.com/ubcsanskrit/sanscript.rb.svg)](https://gemnasium.com/github.com/ubcsanskrit/sanscript.rb)
|
7
|
+
|
8
|
+
This gem is starting off as a mostly-straightforward port of [learnsanskrit.org's Sanscript.js](https://github.com/sanskrit/sanscript.js), and will go from there. It also incorporates transliteration scheme detection based on [learnsanskrit.org's Detect.js](https://github.com/sanskrit/detect.js).
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
Add this line to your application's Gemfile:
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
gem 'sanscript'
|
16
|
+
```
|
17
|
+
|
18
|
+
And then execute:
|
19
|
+
|
20
|
+
$ bundle
|
21
|
+
|
22
|
+
Or install it yourself as:
|
23
|
+
|
24
|
+
$ gem install sanscript
|
25
|
+
|
26
|
+
## Usage
|
27
|
+
|
28
|
+
You can access detection through `Sanscript.detect(text)` and transliteration through `Sanscript.transliterate(text, from, to)`. Code should be fairly straightforward and partially documented.
|
29
|
+
|
30
|
+
## Development
|
31
|
+
|
32
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
33
|
+
|
34
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
35
|
+
|
36
|
+
## Contributing
|
37
|
+
|
38
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/ubcsanskrit/sanscript.rb. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
39
|
+
|
40
|
+
|
41
|
+
## License
|
42
|
+
|
43
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require "bundler/setup"
|
5
|
+
require "sanscript"
|
6
|
+
|
7
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
8
|
+
# with your gem easier. You can also use a different console, if you like.
|
9
|
+
|
10
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
11
|
+
require "pry"
|
12
|
+
Pry.start
|
data/bin/setup
ADDED
data/lib/sanscript.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "sanscript/version"
|
3
|
+
require "sanscript/detect"
|
4
|
+
require "sanscript/transliterate"
|
5
|
+
require "sanscript/benchmark"
|
6
|
+
|
7
|
+
module Sanscript
  module_function

  # Convenience wrapper around Detect.detect_script.
  #
  # @param text [String] the text whose script/scheme should be identified
  # @return whatever Detect.detect_script returns for +text+
  def detect(text)
    Detect.detect_script(text)
  end

  # Transliterates +text+, with or without an explicit source scheme:
  #
  #   transliterate(text, to)        # source scheme is auto-detected
  #   transliterate(text, from, to)  # source and target are both given
  #
  # Any trailing keyword arguments are collected into a Hash and handed to
  # Transliterate.transliterate as its final argument.
  #
  # @param text [String] the text to convert
  # @param first the target scheme (two-argument form) or the source scheme
  #   (three-argument form)
  # @param second the target scheme, or nil to auto-detect the source
  # @return whatever Transliterate.transliterate returns
  def transliterate(text, first, second = nil, **options)
    # Detection only runs when the caller left the source scheme out.
    from, to = second.nil? ? [Detect.detect_script(text), first] : [first, second]
    Transliterate.transliterate(text, from, to, options)
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "sanscript/refinements"
|
4
|
+
require "benchmark"
|
5
|
+
|
6
|
+
module Sanscript
  using Refinements
  # Ad-hoc timing harness for the public detection and transliteration entry
  # points, driven by the standard library's Benchmark.bmbm.
  module Benchmark
    module_function

    # Times Sanscript.detect on one IAST and one Devanagari sample
    # (100_000 calls each). Raises if a call does not return the expected
    # scheme symbol.
    def detection!
      iterations = 100_000
      iast = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
      deva = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"

      ::Benchmark.bmbm(18) do |bench|
        bench.report("Detect IAST") { iterations.times { raise unless Sanscript.detect(iast) == :iast } }
        bench.report("Detect Devanagari") { iterations.times { raise unless Sanscript.detect(deva) == :devanagari } }
      end
    end

    # Times Sanscript.transliterate (5_000 calls per report).
    # Labels containing "**>" call it without a source scheme (auto-detected);
    # labels containing "==>" pass the source scheme explicitly.
    def transliteration!
      iterations = 5_000
      iast = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
      deva = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"

      ::Benchmark.bmbm(18) do |bench|
        bench.report("IAST**>Devanagari") { iterations.times { Sanscript.transliterate(iast, :devanagari) } }
        bench.report("IAST==>Devanagari") { iterations.times { Sanscript.transliterate(iast, :iast, :devanagari) } }
        bench.report("IAST**>SLP1") { iterations.times { Sanscript.transliterate(iast, :slp1) } }
        bench.report("IAST==>SLP1") { iterations.times { Sanscript.transliterate(iast, :iast, :slp1) } }
        bench.report("Devanagari**>SLP1") { iterations.times { Sanscript.transliterate(deva, :slp1) } }
        bench.report("Devanagari**>IAST") { iterations.times { Sanscript.transliterate(deva, :iast) } }
      end
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
#
|
3
|
+
# Developed from code available @ https://github.com/sanskrit/detect.js
|
4
|
+
#
|
5
|
+
module Sanscript
  # Heuristic detection of the script or romanization scheme a text is
  # written in, based on characters and character sequences that are
  # distinctive for each scheme.
  module Detect
    # Match any character in the block of Brahmic scripts
    # between Devanagari and Malayalam.
    RE_BRAHMIC_RANGE = /[\u0900-\u0d7f]/

    # Match each individual Brahmic script.
    RE_BRAHMIC_SCRIPTS = {
      devanagari: /\p{Devanagari}/,
      bengali: /\p{Bengali}/,
      gurmukhi: /\p{Gurmukhi}/,
      gujarati: /\p{Gujarati}/,
      oriya: /\p{Oriya}/,
      tamil: /\p{Tamil}/,
      telugu: /\p{Telugu}/,
      kannada: /\p{Kannada}/,
      malayalam: /\p{Malayalam}/,
    }.freeze

    # Match on special Roman characters
    RE_IAST_OR_KOLKATA_ONLY = /[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]/i

    # Match on Kolkata-specific Roman characters
    RE_KOLKATA_ONLY = /[ēō]/i

    # Match on ITRANS-only
    RE_ITRANS_ONLY = /ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a/

    # Match on SLP1-only characters and bigrams. The final alternative is a
    # "G" at the start of the text/line or right after a non-word character.
    # It must use a real \W class: an escaped backslash ("\\W") would only
    # match the two literal characters "\W".
    RE_SLP1_ONLY = /[fFxXEOCYwWqQPB]|kz|Nk|Ng|tT|dD|Sc|Sn|[aAiIuUfFxXeEoO]R|G[yr]|(?:\W|^)G/

    # Match on Velthuis-only characters
    RE_VELTHUIS_ONLY = /\.[mhnrlntds]|"n|~s/

    # Match on chars shared by ITRANS and Velthuis
    RE_ITRANS_OR_VELTHUIS_ONLY = /aa|ii|uu|~n/

    # Match on characters used by Harvard-Kyoto. This is the last-resort
    # check for romanized text, applied after every more specific scheme
    # has been ruled out.
    RE_HARVARD_KYOTO = /[aAiIuUeoRMHkgGcjJTDNtdnpbmyrlvzSsh]/

    private_constant :RE_BRAHMIC_RANGE, :RE_BRAHMIC_SCRIPTS, :RE_IAST_OR_KOLKATA_ONLY,
                     :RE_KOLKATA_ONLY, :RE_ITRANS_ONLY, :RE_SLP1_ONLY, :RE_VELTHUIS_ONLY,
                     :RE_ITRANS_OR_VELTHUIS_ONLY, :RE_HARVARD_KYOTO

    module_function

    # Guesses the script or romanization scheme of +text+.
    #
    # Brahmic scripts are tried first, in the order listed in
    # RE_BRAHMIC_SCRIPTS. Romanizations are then tried from most to least
    # distinctive, so the order of the branches below is significant.
    #
    # @param text [String] the text to inspect
    # @return [Symbol] one of the RE_BRAHMIC_SCRIPTS keys, :kolkata, :iast,
    #   :itrans, :slp1, :velthuis, :hk, or :unknown when nothing matches
    def detect_script(text)
      # Brahmic schemes are all within a specific range of code points.
      if text =~ RE_BRAHMIC_RANGE
        RE_BRAHMIC_SCRIPTS.each do |script, regex|
          return script if text =~ regex
        end
        # No individual script matched; fall through to the Roman checks.
      end

      # Romanizations
      if text =~ RE_IAST_OR_KOLKATA_ONLY
        text =~ RE_KOLKATA_ONLY ? :kolkata : :iast
      elsif text =~ RE_ITRANS_ONLY
        :itrans
      elsif text =~ RE_SLP1_ONLY
        :slp1
      elsif text =~ RE_VELTHUIS_ONLY
        :velthuis
      elsif text =~ RE_ITRANS_OR_VELTHUIS_ONLY
        # Ambiguous between ITRANS and Velthuis; ITRANS is the chosen default.
        :itrans
      elsif text =~ RE_HARVARD_KYOTO
        :hk
      else
        :unknown
      end
    end
  end
end
|