RubyGems - persian - Versions diffs - 0.0.0 → 0.2.2 - Mend

persian 0.0.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

checksums.yaml +5 -5
data/.editorconfig +9 -0
data/.gitignore +51 -0
data/.rspec +3 -0
data/.rubocop.yml +32 -0
data/.travis.yml +8 -0
data/Gemfile +10 -0
data/Rakefile +36 -0
data/lib/persian/counter.rb +61 -0
data/lib/persian/date.rb +150 -0
data/lib/persian/dynamic.rb +38 -0
data/lib/persian/list/alphabet.rb +107 -0
data/lib/persian/list/character.rb +193 -0
data/lib/persian/list/homonyms.rb +59 -0
data/lib/persian/list/number.rb +168 -0
data/lib/persian/num_text.rb +53 -0
data/lib/persian/number.rb +81 -0
data/lib/persian/text/keyboard.rb +22 -0
data/lib/persian/text/text.rb +214 -0
data/lib/persian/tokenizer.rb +56 -0
data/lib/persian/unicode.rb +42 -0
data/lib/persian/url.rb +25 -0
data/lib/persian/version.rb +2 -1
data/lib/persian.rb +16 -39
data/persian.gemspec +26 -0
data/readme.md +48 -0
data/spec/counter_spec.rb +83 -0
data/spec/dynamic_spec.rb +6 -0
data/spec/num_text_spec.rb +17 -0
data/spec/number_spec.rb +129 -0
data/spec/spec_helper.rb +7 -0
data/spec/text_spec.rb +258 -0
data/spec/tokenizer_spec.rb +31 -0
data/spec/unicode_spec.rb +25 -0
data/spec/url_spec.rb +11 -0
metadata +42 -12

data/lib/persian/text/text.rb ADDED Viewed

@@ -0,0 +1,214 @@
# -*- coding: UTF-8 -*-
# Persian module
module Persian
  # Persian Text class
  # Stateless helpers that clean up and normalise Persian text.
  #
  # Every helper returns the processed text and never modifies its argument,
  # so frozen strings are accepted.
  class Text
    # ZERO WIDTH NON-JOINER (U+200C), written as an escape so it stays visible.
    ZWNJ = "\u200C".freeze
    # Persian semicolon ("noghte-virgool").
    SEMICOLON = '؛'.freeze
    # Persian comma ("virgool").
    COMMA = '،'.freeze
    # Persian question mark.
    QUESTION_MARK = '؟'.freeze
    # Words dropped by remove_stopwords.
    STOPWORDS = ['و', 'در', 'به', 'این', 'با', 'از', 'که', 'است', 'را'].freeze
    # Quoted / bracketed spans rewritten by general_brackets, in processing order.
    PAIRED_SPANS = [/"(.*?)"/, /\[(.*?)\]/, /\{(.*?)\}/, /\((.*?)\)/].freeze

    # Replace Arabic characters with Persian characters.
    # Applies every (pattern, replacement) pair yielded by AR_FA_CHAR in order.
    def self.character(text)
      AR_FA_CHAR.reduce(text) { |result, (from, to)| result.gsub(from, to) }
    end

    # Remove extra spaces in text.
    # Whitespace runs become one space (leading/trailing whitespace is dropped),
    # runs of ZWNJ become a single ZWNJ and a trailing ZWNJ is removed.
    def self.remove_extra_spaces(text)
      text.split.join(' ').squeeze(ZWNJ).sub(/\u200C\z/, '')
    end

    # Remove Arabic harekats from text (every entry of HAREKATS is deleted).
    def self.remove_harekats(text)
      HAREKATS.reduce(text) { |result, harekat| result.gsub(harekat, '') }
    end

    # Remove all brackets (every entry of BRACKETS is deleted).
    def self.remove_brackets(text)
      BRACKETS.reduce(text) { |result, bracket| result.gsub(bracket, '') }
    end

    # Remove Persian signs: every entry of SIGNS is replaced by +with+.
    # Returns an empty string when +text+ is nil.
    def self.remove_signs(text, with = '')
      return '' if text.nil?
      SIGNS.reduce(text) { |result, sign| result.gsub(sign, with) }
    end

    # Replace every ZWNJ with a plain space.
    def self.replace_zwnj_with_space(text)
      text.gsub(ZWNJ, ' ')
    end

    # Replace general brackets with one type of brackets.
    # "...", [...], {...} and (...) are all rewritten to +left+...+right+.
    # Default: « (U+00AB) and » (U+00BB).
    def self.general_brackets(text, left = '«', right = '»')
      replacement = left + '\1' + right
      PAIRED_SPANS.reduce(text) { |result, span| result.gsub(span, replacement) }
    end

    # Append ZWNJ + 'ی' when the last character of the text is listed in
    # END_VOWEL.
    def self.fix_y_after_vowel(text)
      return text unless END_VOWEL.include? text[-1]
      text + ZWNJ + 'ی'
    end

    # Replace the single whitespace after a leading می / نمی with a ZWNJ.
    def self.replace_zwnj_mi(text)
      mi = 'می'
      nmi = 'نمی'
      text.gsub(/(^|\s)(#{mi}|#{nmi})\s(\S+)/, "\\1\\2#{ZWNJ}\\3")
    end

    # Contract "ا" + whitespace + "است" into "است"
    # Example زیبا است => زیباست
    def self.ast(text)
      a = 'ا'
      ast = 'است'
      st = 'ست'
      text.gsub(/(#{a})\s(#{ast})/, '\1' + st)
    end

    # Remove keshide (ـ) from text
    def self.keshide(text)
      text.gsub(/ـ+/, '')
    end

    # Use ی instead of ئ if next char is ی
    # Example پائیز => پاییز
    def self.replace_e_y(text)
      e = 'ئ'
      y = 'ی'
      text.gsub(/#{e}(#{y})/, '\1\1')
    end

    # Replace three or more consecutive dots with a single ellipsis character.
    def self.three_dots(text)
      text.gsub(/\.{3,}/, '…')
    end

    # Attach a detached suffix (تر, تری, ترین, ها, های) to the preceding word
    # with a ZWNJ instead of whitespace.
    # The suffix must not be followed by another letter, so words that merely
    # start with these letters are left alone.
    def self.suffix(text)
      tar = 'تر'
      ee = 'ی'
      n = 'ن'
      ha = 'ها'
      ye = 'ی'
      pattern = /\s+(#{tar}(?:#{ee}#{n}?)?|#{ha}#{ye}?)(?!\p{L})/
      text.gsub(pattern, "#{ZWNJ}\\1")
    end

    # Collapse two or more consecutive question marks into one.
    def self.remove_extra_question_mark(text)
      text.gsub(/(#{QUESTION_MARK}){2,}/, '\1')
    end

    # Insert a ZWNJ after the first +point+ characters of a line.
    # +point+ is interpolated into the pattern as a repetition count.
    def self.add_zwnj(text, point)
      text.scan(/^.{#{point}}|.+/).join(ZWNJ)
    end

    # Reduce a run of question marks followed by a run of exclamation marks
    # to a single "؟!".
    def self.remove_question_exclamation(text)
      exclamation = '!'
      text.gsub(/(#{QUESTION_MARK})+(#{exclamation})+/, '\1\2')
    end

    # Drop the words listed in STOPWORDS; the remaining words are joined with
    # single spaces.
    def self.remove_stopwords(text)
      text.scan(/\S+/).reject { |word| STOPWORDS.include?(word) }.join(' ')
    end

    # Remove whitespace that precedes a Persian semicolon.
    def self.remove_space_noghtevirgool(text)
      text.gsub(/\s+(#{SEMICOLON})/, '\1')
    end

    # Remove punctuation that directly follows a Persian semicolon.
    def self.remove_signs_after_noghtevirgool(text)
      text.gsub(/(#{SEMICOLON})[\.،؛:!؟\-…]+/, '\1')
    end

    # Make sure a Persian semicolon is followed by a space.
    def self.space_after_noghtevirgool(text)
      text.gsub(/(#{SEMICOLON})(\S)/, '\1 \2')
    end

    # Turn a Persian semicolon at the end of a line into a full stop.
    def self.remove_noghtevirgool_para_end(text)
      text.gsub(/#{SEMICOLON}(\n|$)/, '.\1')
    end

    # Remove a Persian semicolon (and the spaces/ZWNJs before it) that directly
    # follows an opening bracket.
    def self.remove_noghtevirgool_baz_start(text)
      text.gsub(/([\(\[«])[ \u200C]*[#{SEMICOLON}]/, '\1')
    end

    # Remove whitespace that precedes a Persian comma.
    def self.remove_space_before_virgool(text)
      text.gsub(/\s+(#{COMMA})/, '\1')
    end

    # Remove punctuation that follows a Persian comma (optionally separated by
    # spaces/ZWNJs). A dot is only removed when it is not the start of "..".
    def self.remove_signs_after_virgool(text)
      pattern = /(،)([ \u200C]+)?([،؛:!؟\-][\.،؛:!؟\-]*|\.(?!\.))/
      text.gsub(pattern, '\1\2')
    end

    # Make sure a Persian comma is followed by a space.
    def self.space_after_virgool(text)
      text.gsub(/(#{COMMA})(\S)/, '\1 \2')
    end

    # Remove every match of +char+.
    # NOTE: +char+ is interpolated into a regular expression unescaped, so
    # regex metacharacters in it keep their special meaning.
    def self.rm_char(text, char)
      text.gsub(/(#{char})/, '')
    end

    # Turn a Persian comma at the end of a line into a full stop, keeping the
    # spaces/ZWNJs/newlines that follow it.
    def self.rm_virgool_in_end(text)
      text.gsub(/(#{COMMA})([ \u200C\n]+)?$/, '.\2')
    end

    # Make sure a dot is followed by a space.
    def self.space_after_dot(text)
      text.gsub(/(\.)(\S)/, '\1 \2')
    end

    # Collapse every run of a repeated character into a single character.
    def self.squeeze(text)
      text.squeeze
    end

    # Remove specific character from end of text
    # Example: remove_postfix('پسره','ه')
    def self.remove_postfix(text, postfix)
      text.chomp(postfix)
    end
  end
end

data/lib/persian/tokenizer.rb ADDED Viewed

@@ -0,0 +1,56 @@
# -*- coding: UTF-8 -*-
# Persian module
module Persian
  # Persian tokenize class
  class Tokenizer
    # Stand-alone punctuation marks.
    SYMBOLS = ['!', '﷼', ':', '؛', '؟', '،', '-', '.'].freeze
    # Opening halves of paired delimiters.
    PAIR_PRE = ['(', '{', '«', '<', '['].freeze
    # Closing halves of paired delimiters.
    PAIR_POST = [')', '}', '»', '>', ']'].freeze
    # Quote marks that can appear on either side of a word.
    PREPOST = ["'", '"'].freeze
    # All delimiter characters, escaped for use inside a character class.
    DELIMITERS = Regexp.escape((SYMBOLS + PAIR_PRE + PAIR_POST + PREPOST).join).freeze
    # A token is either a run of non-delimiters (a word) or a run of delimiters.
    TOKEN_PATTERN = /[^#{DELIMITERS}]+|[#{DELIMITERS}]+/

    # Basic persian word tokenizer
    # The text is split on whitespace, then every chunk is split into
    # alternating runs of word characters and delimiter characters, so
    # punctuation in the middle of a chunk is separated as well.
    # Return an array of words; [''] for empty or whitespace-only text.
    def self.tokenize(text)
      chunks = text.split(/\s/)
      return [''] if chunks.empty?
      chunks.flat_map { |chunk| chunk.scan(TOKEN_PATTERN) }
    end

    # Build word n-grams: every run of +num+ consecutive tokens joined with a
    # single space.
    # Return an empty array when +num+ is smaller than 1 or larger than the
    # number of tokens.
    def self.tokenize_more(text, num)
      return [] if num < 1
      tokenize(text).each_cons(num).map { |gram| gram.join(' ') }
    end

    # Split paragraphs
    # Return an array of the non-empty lines of the text
    def self.split_paragraphs(text)
      text.split("\n").reject(&:empty?)
    end
  end
end

data/lib/persian/unicode.rb ADDED Viewed

@@ -0,0 +1,42 @@
# -*- coding: UTF-8 -*-
# Persian module
module Persian
  # Persian Unicode class
  # Helpers for code points and bidirectional formatting characters.
  class Unicode
    # LEFT-TO-RIGHT EMBEDDING
    LRE = 0x202A
    # RIGHT-TO-LEFT EMBEDDING
    RLE = 0x202B
    # POP DIRECTIONAL FORMATTING
    PDF = 0x202C
    # LEFT-TO-RIGHT OVERRIDE
    LRO = 0x202D
    # RIGHT-TO-LEFT OVERRIDE
    RLO = 0x202E

    # Convert a code point to its UTF-8 character.
    # +char+ is either an Integer code point or a hexadecimal String ('06A9').
    # Integer is checked instead of Fixnum, which no longer exists in
    # current Ruby versions.
    def self.codepoint_to_char(char)
      codepoint = char.is_a?(Integer) ? char : char.hex
      [codepoint].pack('U')
    end

    # Return text between RIGHT-TO-LEFT EMBEDDING (U+202B) and
    # POP DIRECTIONAL FORMATTING (U+202C)
    def self.rle(text)
      enclose(RLE, text)
    end

    # Return text between LEFT-TO-RIGHT EMBEDDING (U+202A) and
    # POP DIRECTIONAL FORMATTING (U+202C)
    def self.lre(text)
      enclose(LRE, text)
    end

    # Return text between RIGHT-TO-LEFT OVERRIDE (U+202E) and
    # POP DIRECTIONAL FORMATTING (U+202C)
    def self.rlo(text)
      enclose(RLO, text)
    end

    # Return text between LEFT-TO-RIGHT OVERRIDE (U+202D) and
    # POP DIRECTIONAL FORMATTING (U+202C)
    def self.lro(text)
      enclose(LRO, text)
    end

    # Put the formatting character +opening+ before the text and the pop
    # character after it.
    def self.enclose(opening, text)
      codepoint_to_char(opening) + text + codepoint_to_char(PDF)
    end
    private_class_method :enclose
  end
end

data/lib/persian/url.rb ADDED Viewed

@@ -0,0 +1,25 @@
# -*- coding: UTF-8 -*-
# Persian module
module Persian
  # Persian Url class
  # Builds URL slugs from Persian text.
  class Url
    # Turn +text+ into a dash-separated slug.
    # Relies on Persian::Text for the bracket, harekat, sign and space clean-up.
    def self.urlify(text)
      # remove brackets
      text = Text.remove_brackets(text)
      # remove harekats
      text = Text.remove_harekats(text)
      # remove slash and backslash
      # (a character class: the former pattern had an empty alternative that
      # matched before the backslash branch, so backslashes were kept)
      text = text.gsub(%r{[/\\]}, '')
      # replace signs with a space so the words around them stay separated
      text = Text.remove_signs(text, ' ')
      # Remove extra spaces
      text = Text.remove_extra_spaces(text)
      # trim spaces from start and end of text, then replace space with dash
      text.strip.gsub(/\s/, '-')
    end
  end
end

data/lib/persian/version.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 # -*- coding: UTF-8 -*-
+# Persian module
 module Persian
-  VERSION = '0.0.0'
+  VERSION = '0.2.2'.freeze
 end

data/lib/persian.rb CHANGED Viewed

@@ -1,42 +1,19 @@
 # -*- coding: UTF-8 -*-
-class Persian
-    def self.number num
+# lists
+require 'persian/list/alphabet'
+require 'persian/list/number'
+require 'persian/list/character'
+require 'persian/list/homonyms'
-      if num.is_a? Numeric
-        num = num.to_s
-      end
-      nums = {
-        # english numbers
-        "0" => "۰",
-        "1" => "۱",
-        "2" => "۲",
-        "3" => "۳",
-        "4" => "۴",
-        "5" => "۵",
-        "6" => "۶",
-        "7" => "۷",
-        "8" => "۸",
-        "9" => "۹",
-        #arabic numbers
-        "٠" => "۰",
-        "١" => "۱",
-        "٢" => "۲",
-        "٣" => "۳",
-        "٤" => "۴",
-        "٥" => "۵",
-        "٦" => "۶",
-        "٧" => "۷",
-        "٨" => "۸",
-        "٩" => "۹",
-      }
-      nums.each {|k, v|
-        num.gsub!(k, v)
-      }
-      return num
-    end
-end
+# classes
+require 'persian/number'
+require 'persian/text/text'
+require 'persian/text/keyboard'
+require 'persian/num_text'
+require 'persian/date'
+require 'persian/tokenizer'
+require 'persian/counter'
+require 'persian/unicode'
+require 'persian/dynamic'
+require 'persian/url'

data/persian.gemspec ADDED Viewed

@@ -0,0 +1,26 @@
# -*- encoding: utf-8 -*-
# Gem specification for the persian gem.

# Put lib/ on the load path so the version constant can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

require 'persian/version'

Gem::Specification.new do |s|
  s.name        = 'persian'
  s.version     = Persian::VERSION
  s.date        = '2022-03-25'
  s.summary     = 'Persian language for ruby.'
  s.description = 'A set of utilities for Persian language.'
  s.authors     = ['Dariush Abbasi']
  s.email       = 'poshtehani@gmail.com'
  # File lists come from git, so the spec must be evaluated inside the
  # repository checkout.
  s.files       = `git ls-files`.split("\n")
  # A single-item brace ("{spec}") is not expanded by the shell or by git and
  # matched nothing; name the directory directly.
  s.test_files  = `git ls-files -- spec/*`.split("\n")
  s.executables =
    `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ['lib']
  s.homepage      =
    'http://github.com/dariubs/persian.rb'
  s.license       = 'MIT'

  s.add_development_dependency 'rspec', '3.4'
end

data/readme.md ADDED Viewed

@@ -0,0 +1,48 @@
+<p align="center">
+    <img src="https://upload.wikimedia.org/wikipedia/commons/a/a2/Farsi.svg"
+         height="130" alt="Persian ruby gem">
+</p>
+<a href="https://travis-ci.org/negah/persian">
+    <img src="https://travis-ci.org/negah/persian.svg?branch=master"
+            alt="Build Status">
+</a>
+<a href="https://rubygems.org/gems/persian">
+    <img src="https://img.shields.io/badge/gem-persian-orange.svg"
+            alt="Ruby Gems">
+</a>
+<a href="https://rubygems.org/gems/persian">
+    <img src="https://img.shields.io/gem/dv/persian/stable.svg?maxAge=2592000"
+            alt="Ruby Gems downloads">
+</a>
+<a href="https://codeclimate.com/github/negah/persian">
+    <img src="https://codeclimate.com/github/negah/persian/badges/gpa.svg"
+            alt="Code Climate">
+</a>
+<p align="center"><sup><strong> Ruby gem for working with Persian text. </strong></sup></p>
+Install
+-----
+```shell
+gem install persian
+```
+Usage
+-----
+```ruby
+require 'persian'
+```
+Components
+----------
+incomplete.
+License
+-------
+Released under the MIT License.

data/spec/counter_spec.rb ADDED Viewed

@@ -0,0 +1,83 @@
+# -*- coding: UTF-8 -*-
+require 'spec_helper'
+describe 'persian counter methods' do
+  it 'should return a hash of characters with their number of occurrence' do
+    before = 'من غلام قمرم غیر قمر هیچ مگو'
+    after = {
+      'م' => 6,
+      'ن' => 1,
+      ' ' => 6,
+      'غ' => 2,
+      'ل' => 1,
+      'ا' => 1,
+      'ق' => 2,
+      'ر' => 3,
+      'ی' => 2,
+      'ه' => 1,
+      'چ' => 1,
+      'گ' => 1,
+      'و' => 1
+    }
+    arg = 'غ'
+    after_with_arg = 2
+    expect(Persian::Counter.character(before)).to eq(after)
+    expect(Persian::Counter.character(before, arg)).to eq(after_with_arg)
+  end
+  it 'should return a hash of words as key and number of occurrence of word as value' do
+    before = 'پرچم دوران هخامنشی به احتمال زیاد عقابی با بال های گشوده با قرص خورشیدی در پشت سر عقاب بوده است'
+    after = {
+      'پرچم' => 1,
+      'دوران' => 1,
+      'هخامنشی' => 1,
+      'به' => 1,
+      'احتمال' => 1,
+      'زیاد' => 1,
+      'عقابی' => 1,
+      'با' => 2,
+      'بال' => 1,
+      'های' => 1,
+      'گشوده' => 1,
+      'قرص' => 1,
+      'خورشیدی' => 1,
+      'در' => 1,
+      'پشت' => 1,
+      'سر' => 1,
+      'عقاب' => 1,
+      'بوده' => 1,
+      'است' => 1
+    }
+    arg = 'با'
+    after_with_arg = 2
+    expect(Persian::Counter.word(before)).to eq(after)
+    expect(Persian::Counter.word(before, arg)).to eq(after_with_arg)
+  end
+  it 'should return number of paragraphs' do
+    text = "
+یوهانس برامس در سال ۱۸۳۳ در شهر هامبورگ آلمان در خانواده‌ای فقیر به دنیا آمد. تحصیلات ابتدایی موسیقی را نزد پدرش که نوازنده کنترباس بود فرا گرفت.
+برامس با ویولونیست‌های مشهوری چون رمنی و یواخیم آشنا شد و در طول این آشنایی بود که رمنی موسیقی محلی مجارستان را به برامس معرفی کرد و تحت تأثیر آن برامس رقص‌های مجار خود را نوشت.
+"
+    after = 2
+    expect(Persian::Counter.paragraph(text)).to eq(after)
+  end
+  it 'shoud count uniq characters' do
+    text = 'دوستت دارم'
+    size = 8
+    expect(Persian::Counter.uniq_character(text)).to eq(size)
+  end
+  it 'shoud return length of text' do
+    text = 'راهی بزن که آهی بر ساز آن توان زد'
+    size = 33
+    expect(Persian::Counter.character_counter(text)).to eq(size)
+  end
+end

data/spec/dynamic_spec.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# -*- coding: UTF-8 -*-
+require 'spec_helper'
+describe 'persian dynamic methods methods' do
+end

data/spec/num_text_spec.rb ADDED Viewed

@@ -0,0 +1,17 @@
+# -*- coding: UTF-8 -*-
+require 'spec_helper'
+describe 'persian number to character methods' do
+  it 'should convert english numbers to spelled persian character' do
+    before = 1234
+    after = 'یک هزار و دویست و سی و چهار'
+    expect(Persian::NumText.num_to_char(before)).to eq(after)
+  end
+  it 'should convert Persian numbers to spelled persian number' do
+    before = '۲۰۴۸۲۰۴۸'
+    after = 'بیست میلیون و چهارصد و هشتاد و دو هزار و چهل و هشت'
+    expect(Persian::NumText.num_to_char(before)).to eq(after)
+  end
+end