nlp_arabic 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.travis.yml +3 -0
- data/Gemfile +4 -0
- data/README.md +67 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/nlp_arabic.rb +263 -0
- data/lib/nlp_arabic/characters.rb +102 -0
- data/lib/nlp_arabic/version.rb +3 -0
- data/nlp_arabic.gemspec +23 -0
- metadata +82 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 9dce240342285bde2206509493990d37a44143df
  data.tar.gz: b55584ef3a20f4b60f1f0637108149922605f06d
SHA512:
  metadata.gz: 9d92a384d51125411cca0a89479c7889830b92e4c86eaf29241bad88029bf1bc704a916151b36438737d7de4725eb1d39bd151d077685010419e4d8022824598
  data.tar.gz: cd0855407749d5b51d22c43eef77027b53d8974d6ec2dcce27f7edee3577d122cd2c884fa26fbc8f04f09f7d0c0cc3f46d090c0dc504e8aba01d9606f5b7d914
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,67 @@
NlpArabic
=========

This gem is intended to contain tools for Arabic Natural Language Processing.
As of version 0.1, this gem allows you to:

1. Clean a text using a stop list. The stop list was generated from tf-idf scores computed on words from over 900 articles. The selected words were then checked and validated by hand, resulting in a stop list of over 270 words.

2. Stem a word or a text. The stemming algorithm used is the ISRI Arabic stemmer. It is described in the following research paper:

[Arabic Stemming without a root dictionary](http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=1428453&url=http%3A%2F%2Fieeexplore.ieee.org%2Fiel5%2F9755%2F30835%2F01428453.pdf%3Farnumber%3D1428453)

This root-extraction stemmer is similar to the Khoja stemmer but does not use a root dictionary, which can be laborious to maintain. Also, when the root cannot be found, the ISRI stemmer returns a normalized form rather than the original unmodified form. Overall, the ISRI stemmer has been shown to perform as well as, if not better than, the Khoja stemmer.

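The normalization at the heart of the stemmer (stripping diacritics, collapsing hamza carriers to a bare alef) can be sketched with plain `String#gsub`. The character classes below mirror the ones the gem ships in `lib/nlp_arabic/characters.rb`; `normalize` is a hypothetical helper name for this sketch, not part of the gem's API:

```ruby
# Minimal sketch of the stemmer's normalization steps, assuming the
# same character classes as lib/nlp_arabic/characters.rb.
DIACRITICS = /[\u064b-\u0652\u0640]/ # fathatan..sukun, plus the tatweel
HAMZAAS    = /[\u0621\u0624\u0626]/  # hamza, and waw/yeh carrying a hamza
ALEF       = "\u0627"

def normalize(word)
  word.gsub(DIACRITICS, '').gsub(HAMZAAS, ALEF)
end

# "مُدَرِّسَة" (teacher) loses its short vowels and shadda,
# leaving the bare skeleton "مدرسة":
puts normalize("\u0645\u064f\u062f\u064e\u0631\u0651\u0650\u0633\u064e\u0629")
```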
Installation
============

Add this line to your application's Gemfile:

```ruby
gem 'nlp_arabic'
```

And then execute:

    $ bundle

Or install it yourself as:

    $ gem install nlp_arabic

## Usage

Once installed, you can use it like this:

`NlpArabic.clean_text(text)` will return the text without the stop words.

`NlpArabic.stem(word)` will return the stemmed word.

`NlpArabic.stem_text(text)` will stem an entire text.

`NlpArabic.clean_and_stem(text)` will do both.

`NlpArabic.wash_and_stem(text)` will stem the text, removing stop words and delimiters from it.

`NlpArabic.tokenize_text(text)` will break the text into an array of words and delimiters.

Each step of the ISRI algorithm is coded in a separate function, so you should be able to find the helper you are looking for just by browsing the code.

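Under the hood, cleaning is tokenization followed by `Array#-` against the stop list. A self-contained sketch of that pipeline (the two-entry stop list and the `tokenize`/`clean` helpers here are illustrative stand-ins, not the gem's 270-word list or its API):

```ruby
# Sketch of the tokenize-then-subtract pipeline behind text cleaning.
STOP_SUBSET = ["\u0641\u064a", "\u0645\u0646"].freeze # "في" (in), "من" (from)

def tokenize(text)
  # Split on whitespace and punctuation, keeping captured delimiters,
  # in the spirit of NlpArabic.tokenize_text.
  text.split(/\s|(\?+)|(\.+)|(\u060c+)|(\u061f+)/).delete_if(&:empty?)
end

def clean(text)
  (tokenize(text) - STOP_SUBSET).join(' ')
end

# The stop word "في" is dropped, leaving "ذهب الصباح":
puts clean("\u0630\u0647\u0628 \u0641\u064a \u0627\u0644\u0635\u0628\u0627\u062d")
```

Note that `Array#-` removes every occurrence of a stop word, which is exactly the behavior wanted here.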
Development
===========

After checking out the repo, run `bin/console` for an interactive prompt that will allow you to experiment. For now the gem doesn't have any dependencies, so you don't need to run `bin/setup`.

To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).

Contributing
============
You are more than welcome to contribute to this project :) Please try to respect the Ruby style guidelines described [here](https://github.com/bbatsov/ruby-style-guide). The default encoding used is UTF-8.

1. Fork it ( https://github.com/othmanela/nlp_arabic/fork )
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Write unit tests and make sure all of them (including the old ones) pass
4. Commit your changes (`git commit -am 'Add some feature'`)
5. Push to the branch (`git push origin my-new-feature`)
6. Create a new Pull Request
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby

require "bundler/setup"
require "nlp_arabic"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start
data/bin/setup
ADDED
data/lib/nlp_arabic.rb
ADDED
@@ -0,0 +1,263 @@
require "nlp_arabic/version"
require "nlp_arabic/characters"

module NlpArabic
  def self.stem(word)
    # Stems a word following the steps of the ISRI stemmer
    # Step 1: remove diacritics
    word = remove_diacritics(word)
    # Step 2: normalize hamza, waw and yeh forms to a bare alef
    word = normalize_hamzaas(word)
    # Step 3: remove prefixes of length three, then two
    word = remove_prefix(word)
    # Step 4: remove suffixes of length three, then two
    word = remove_suffix(word)
    # Step 5: remove the connective waw
    word = remove_waw(word)
    # Step 6: convert initial alef variants (optional)
    word = convert_initial_alef(word)
    # Step 7: process words longer than three letters
    if word.length == 4
      word = word_4(word)
    elsif word.length == 5
      word = pattern_53(word)
      word = word_5(word)
    elsif word.length == 6
      word = pattern_6(word)
      word = word_6(word)
    elsif word.length == 7
      word = short_suffix(word)
      word = short_prefix(word) if word.length == 7
      if word.length == 6
        word = pattern_6(word)
        word = word_6(word)
      end
    end
    return word
  end

  def self.clean_text(text)
    # Cleans the text using a stop list
    tokenized_text = NlpArabic.tokenize_text(text)
    clean_text = (tokenized_text - NlpArabic::STOP_LIST)
    return clean_text.join(' ')
  end

  def self.stem_text(text)
    # Only stems the text using the ISRI algorithm
    tokenized_text = NlpArabic.tokenize_text(text)
    for i in (0..(tokenized_text.length - 1))
      tokenized_text[i] = stem(tokenized_text[i]) if NlpArabic.is_alpha(tokenized_text[i])
    end
    return tokenized_text.join(' ')
  end

  def self.clean_and_stem(text)
    # Cleans the text using the stop list, then stems it
    tokenized_text = NlpArabic.tokenize_text(text)
    clean_text = (tokenized_text - NlpArabic::STOP_LIST)
    for i in (0..(clean_text.length - 1))
      clean_text[i] = stem(clean_text[i]) if NlpArabic.is_alpha(clean_text[i])
    end
    return clean_text.join(' ')
  end

  def self.tokenize_text(text)
    return text.split(/\s|(\?+)|(\.+)|(!+)|(\,+)|(\;+)|(\،+)|(\؟+)|(\:+)|(\(+)|(\)+)/).delete_if(&:empty?)
  end

  def self.wash_and_stem(text)
    # Stems the text after removing stop words and delimiters from it
    clean_text = text.gsub(/[._,،\"\':–%\/;·&?؟()\”\“]/, '').split - NlpArabic::STOP_LIST
    new_text = []
    for i in (0..(clean_text.length - 1))
      new_text << stem(clean_text[i]) if NlpArabic.is_alpha(clean_text[i])
    end
    new_text -= NlpArabic::STOP_LIST
    return new_text.join(' ')
  end

  def self.is_alpha(word)
    # Checks if a word contains only alphabetic characters
    return !!word.match(/^[[:alpha:]]+$/)
  end

  def self.remove_na_characters(word)
    # Cleans the word of non-alphabetic characters
    return word.strip.gsub(/[._,،\"\':–%\/;·&?؟()\”\“]/, '')
  end

  def self.remove_diacritics(word)
    # Removes the Arabic diacritics (fathatan, dammatan, kasratan, fatha, damma, kasra, shadda, sukun) and the tatweel
    return word.gsub(/#{NlpArabic::DIACRITICS}/, '')
  end

  def self.convert_initial_alef(word)
    # Converts all the alef variants to a bare alef
    return word.gsub(/#{NlpArabic::ALIFS}/, NlpArabic::ALEF)
  end

  def self.normalize_hamzaas(word)
    # Normalizes the hamzaas to an alef
    return word.gsub(/#{NlpArabic::HAMZAAS}/, NlpArabic::ALEF)
  end

  def self.remove_prefix(word)
    # Removes the prefixes of length three, then the prefixes of length two
    if word.length >= 6
      return word[3..-1] if word.start_with?(*NlpArabic::P3)
    end
    if word.length >= 5
      return word[2..-1] if word.start_with?(*NlpArabic::P2)
    end
    return word
  end

  def self.remove_suffix(word)
    # Removes the suffixes of length three, then the suffixes of length two
    if word.length >= 6
      return word[0..-4] if word.end_with?(*NlpArabic::S3)
    end
    if word.length >= 5
      return word[0..-3] if word.end_with?(*NlpArabic::S2)
    end
    return word
  end

  def self.remove_waw(word)
    # Removes the letter و if the word starts with a double waw
    if word.length >= 4
      return word[1..-1] if word.start_with?(*NlpArabic::DOUBLE_WAW)
    end
    return word
  end

  def self.word_4(word)
    # Processes the words of length four
    if NlpArabic::PR4[0].include? word[0]
      return word[1..-1]
    elsif NlpArabic::PR4[1].include? word[1]
      word[1] = ''
    elsif NlpArabic::PR4[2].include? word[2]
      word[2] = ''
    elsif NlpArabic::PR4[3].include? word[3]
      word[3] = ''
    else
      word = short_suffix(word)
      word = short_prefix(word) if word.length == 4
    end
    return word
  end

  def self.word_5(word)
    # Processes the words of length five
    if word.length == 4
      word = word_4(word)
    elsif word.length == 5
      word = pattern_54(word)
    end
    return word
  end

  def self.pattern_53(word)
    # Helper that processes the length-five patterns and extracts the length-three roots.
    # The include? calls are parenthesized so that the && is not swallowed into the argument.
    if NlpArabic::PR53[0].include?(word[2]) && word[0] == NlpArabic::ALEF
      word = word[1] + word[3..-1]
    elsif NlpArabic::PR53[1].include?(word[3]) && word[0] == NlpArabic::MEEM
      word = word[1..2] + word[4]
    elsif NlpArabic::PR53[2].include?(word[0]) && word[4] == NlpArabic::TEH_MARBUTA
      word = word[1..3]
    elsif NlpArabic::PR53[3].include?(word[0]) && word[2] == NlpArabic::TEH
      word = word[1] + word[3..-1]
    elsif NlpArabic::PR53[4].include?(word[0]) && word[2] == NlpArabic::ALEF
      word = word[1] + word[3..-1]
    elsif NlpArabic::PR53[5].include?(word[2]) && word[4] == NlpArabic::TEH_MARBUTA
      word = word[0..1] + word[3]
    elsif NlpArabic::PR53[6].include?(word[0]) && word[1] == NlpArabic::NOON
      word = word[2..-1]
    elsif word[3] == NlpArabic::ALEF && word[0] == NlpArabic::ALEF
      word = word[1..2] + word[4]
    elsif word[4] == NlpArabic::NOON && word[3] == NlpArabic::ALEF
      word = word[0..2]
    elsif word[3] == NlpArabic::YEH && word[0] == NlpArabic::TEH
      word = word[1..3] + word[4]
    elsif word[3] == NlpArabic::WAW && word[0] == NlpArabic::ALEF
      word = word[0] + word[2] + word[4]
    elsif word[2] == NlpArabic::ALEF && word[1] == NlpArabic::WAW
      word = word[0] + word[3..-1]
    elsif word[3] == NlpArabic::YEH_WITH_HAMZA_ABOVE && word[2] == NlpArabic::ALEF
      word = word[0..1] + word[4]
    elsif word[4] == NlpArabic::TEH_MARBUTA && word[1] == NlpArabic::ALEF
      word = word[0] + word[2..3]
    elsif word[4] == NlpArabic::YEH && word[2] == NlpArabic::ALEF
      word = word[0..1] + word[3]
    else
      word = short_suffix(word)
      word = short_prefix(word) if word.length == 5
    end
    return word
  end

  def self.pattern_54(word)
    # Helper that processes the length-five patterns and extracts the length-four roots
    if NlpArabic::PR53[2].include? word[0]
      word = word[1..-1]
    elsif word[4] == NlpArabic::TEH_MARBUTA
      word = word[0..3]
    elsif word[2] == NlpArabic::ALEF
      word = word[0..1] + word[3..-1]
    end
    return word
  end

  def self.word_6(word)
    # Processes the words of length six
    if word.length == 5
      word = pattern_53(word)
      word = word_5(word)
    elsif word.length == 6
      word = pattern_64(word)
    end
    return word
  end

  def self.pattern_6(word)
    # Helper that processes the length-six patterns and extracts the length-three roots
    if word.start_with?(*NlpArabic::IST) || word.start_with?(*NlpArabic::MST)
      word = word[3..-1]
    elsif word[0] == NlpArabic::MEEM && word[3] == NlpArabic::ALEF && word[5] == NlpArabic::TEH_MARBUTA
      word = word[1..2] + word[4]
    elsif word[0] == NlpArabic::ALEF && word[2] == NlpArabic::TEH && word[4] == NlpArabic::ALEF
      word = word[1] + word[3] + word[5]
    elsif word[0] == NlpArabic::ALEF && word[3] == NlpArabic::WAW && word[2] == word[4]
      word = word[1] + word[4..-1]
    elsif word[0] == NlpArabic::TEH && word[2] == NlpArabic::ALEF && word[4] == NlpArabic::YEH
      word = word[1] + word[3] + word[5]
    else
      word = short_suffix(word)
      word = short_prefix(word) if word.length == 6
    end
    return word
  end

  def self.pattern_64(word)
    # Helper that processes the length-six patterns and extracts the length-four roots
    if word[0] == NlpArabic::ALEF && word[4] == NlpArabic::ALEF
      word = word[1..3] + word[5]
    else
      word = word[2..-1]
    end
    return word
  end

  def self.short_prefix(word)
    # Removes the short prefixes
    word = word[1..-1] if word.start_with?(*NlpArabic::P1)
    return word
  end

  def self.short_suffix(word)
    # Removes the short suffixes
    word = word[0..-2] if word.end_with?(*NlpArabic::S1)
    return word
  end
end
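The affix-removal helpers above only strip an affix when enough letters would remain for a root. A runnable sketch of `remove_prefix`, inlining the `P3`/`P2` constants from `characters.rb`:

```ruby
# Prefixes of length three and two, as in NlpArabic::P3 / NlpArabic::P2.
P3 = ["\u0643\u0627\u0644", "\u0628\u0627\u0644", "\u0648\u0644\u0644", "\u0648\u0627\u0644"].freeze
P2 = ["\u0627\u0644", "\u0644\u0644"].freeze

def remove_prefix(word)
  # Strip only when enough letters remain to hold a trilateral root.
  return word[3..-1] if word.length >= 6 && word.start_with?(*P3)
  return word[2..-1] if word.length >= 5 && word.start_with?(*P2)
  word
end

# "والكتاب" (and-the-book) loses the three-letter prefix "وال",
# leaving "كتاب":
puts remove_prefix("\u0648\u0627\u0644\u0643\u062a\u0627\u0628")
```

Short words pass through untouched, which keeps very common two- and three-letter words from being hollowed out.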
data/lib/nlp_arabic/characters.rb
ADDED
@@ -0,0 +1,102 @@
module NlpArabic

  # Stop list
  STOP_LIST = ["\u0648","\u064a\u0643\u0648\u0646","\u0644\u064A\u0633","\u0648\u0644\u064a\u0633","\u0648\u0643\u0627\u0646","\u0643\u0630\u0644\u0643","\u0627\u0644\u062a\u064a","\u0648\u0628\u064a\u0646",
    "\u0639\u0644\u064a\u0647\u0627","\u0639\u0644\u064A","\u0645\u0633\u0627\u0621","\u0627\u0644\u0630\u064a","\u0648\u0643\u0627\u0646\u062a","\u0644\u0643\u0646","\u0648\u0644\u0643\u0646","\u0648\u0627\u0644\u062a\u064a",
    "\u062a\u0643\u0648\u0646","\u0627\u0644\u064a\u0648\u0645","\u0627\u0644\u0644\u0630\u064a\u0646","\u0639\u0644\u064a\u0647","\u0643\u0627\u0646\u062a",
    "\u0644\u0630\u0644\u0643","\u0623\u0645\u0627\u0645","\u0647\u0646\u0627","\u0647\u0646\u0627\u0643","\u0645\u0646\u0647\u0627","\u0645\u0627\u0632\u0627\u0644","\u0644\u0627\u0632\u0627\u0644",
    "\u0644\u0627\u064a\u0632\u0627\u0644","\u0645\u0627\u064a\u0632\u0627\u0644","\u0627\u0635\u0628\u062d","\u0623\u0635\u0628\u062d","\u0623\u0645\u0633\u0649",
    "\u0627\u0645\u0633\u0649","\u0623\u0636\u062d\u0649","\u0627\u0636\u062d\u0649","\u0645\u0627\u0628\u0631\u062d","\u0645\u0627\u0641\u062a\u0626","\u0645\u0627\u0627\u0646\u0641\u0643",
    "\u0644\u0627\u0633\u064a\u0645\u0627","\u0648\u0644\u0627\u064a\u0632\u0627\u0644","\u0627\u0644\u062d\u0627\u0644\u064a","\u0627\u0644\u064a\u0647\u0627","\u0627\u0644\u0630\u064a\u0646","\u0641\u0627\u0646\u0647",
    "\u0648\u0627\u0644\u0630\u064a","\u0648\u0647\u0630\u0627","\u0644\u0647\u0630\u0627","\u0641\u0643\u0627\u0646","\u0633\u062a\u0643\u0648\u0646","\u0627\u0644\u064a\u0647",
    "\u064a\u0645\u0643\u0646","\u0628\u0647\u0630\u0627","\u0627\u0644\u0630\u0649","\u0641\u0649","\u0641\u064a","\u0643\u0644","\u0644\u0645","\u0644\u0646","\u0644\u0647","\u0645\u0646","\u0647\u0648",
    "\u0643\u0645\u0627","\u0644\u0647\u0627","\u0645\u0646\u0630","\u0642\u062F","\u0648\u0642\u062F","\u0648\u0644\u0627","\u0648\u0642\u0627\u0644","\u0648\u0642\u0627\u0644\u062A",
    "\u0644\u0644\u0627\u0645\u0645","\u0641\u064A\u0647","\u0643\u0644\u0645","\u0648\u0641\u064A","\u0648\u0642\u0641","\u0648\u0644\u0645","\u0648\u0645\u0646","\u0648\u0647\u0648","\u0648\u0647\u064A",
    "\u062D\u064A\u062B","\u0627\u0643\u062F","\u0627\u0644\u0627","\u0627\u0645\u0627","\u0627\u0645\u0633","\u0627\u0644\u0633\u0627\u0628\u0642","\u0627\u0644\u062A\u0649","\u0627\u0643\u062B\u0631",
    "\u0627\u064A\u0627\u0631","\u0627\u064A\u0636\u0627","\u0627\u0644\u0630\u0627\u062A\u064A","\u0627\u0644\u0627\u062E\u064A\u0631\u0629","\u0627\u0644\u0627\u0646","\u0627\u0645\u0627\u0645","\u0627\u064A\u0627\u0645",
    "\u062E\u0644\u0627\u0644","\u062D\u0648\u0627\u0644\u0649","\u0630\u0644\u0643","\u062F\u0648\u0646","\u062D\u0648\u0644","\u062D\u064A\u0646","\u0627\u0644\u0641","\u0627\u0644\u0649","\u0648\u062A\u0645",
    "\u0627\u0646\u0647","\u0627\u0648\u0644","\u0636\u0645\u0646","\u0627\u0646\u0647\u0627","\u062C\u0645\u064A\u0639","\u0627\u0644\u0645\u0627\u0636\u064A","\u0627\u0644\u0648\u0642\u062A",
    "\u0627\u0644\u0645\u0642\u0628\u0644","\u0644\u0627","\u0645\u0627","\u0645\u0639","\u0647\u0630\u0627","\u0648\u0627\u062D\u062F","\u0641\u0627\u0646","\u0642\u0627\u0644","\u0643\u0627\u0646",
    "\u0644\u062F\u0649","\u0646\u062D\u0648","\u0647\u0630\u0647","\u0648\u0627\u0646","\u0648\u0627\u0643\u062F","\u0639\u0634\u0631","\u0639\u062F\u062F","\u0639\u062F\u0629","\u0639\u0634\u0631\u0629","\u0639\u062F\u0645",
    "\u0639\u0627\u0645","\u0639\u0627\u0645\u0627","\u0639\u0646","\u0639\u0646\u062F","\u0639\u0646\u062F\u0645\u0627","\u0639\u0644\u0649","\u0633\u0646\u0629","\u0633\u0646\u0648\u0627\u062A","\u062A\u0645","\u0636\u062F",
    "\u0628\u0639\u062F","\u0628\u0639\u0636","\u0627\u0639\u0627\u062F\u0629","\u0627\u0639\u0644\u0646\u062A","\u0628\u0633\u0628\u0628","\u062D\u062A\u0649","\u0627\u0630\u0627","\u0627\u062D\u062F","\u0645\u0645\u0646",
    "\u0627\u062B\u0631","\u063A\u062F\u0627","\u0634\u062E\u0635\u0627","\u0635\u0628\u0627\u062D","\u0627\u0637\u0627\u0631","\u0627\u0631\u0628\u0639\u0629","\u0627\u062E\u0631\u0649","\u0628\u0627\u0646",
    "\u0627\u062C\u0644","\u063A\u064A\u0631","\u0628\u0634\u0643\u0644","\u062D\u0627\u0644\u064A\u0627","\u0628\u0646","\u0628\u0647","\u062B\u0645","\u0627\u0641","\u0627\u0646","\u0627\u0648","\u0627\u064A",
    "\u0628\u0647\u0627","\u0635\u0641\u0631","\u0627\u0644\u062B\u0627\u0646\u064A","\u0627\u0644\u062B\u0627\u0646\u064A\u0629","\u0627\u062F\u0627","\u0627\u0648\u0644\u0627","\u0648\u0644\u0643\u0646\u0647",
    "\u0627\u0644\u0627\u0648\u0644","\u0627\u0644\u0627\u0648\u0644\u0649","\u0628\u064A\u0646","\u0630\u0644\u0643","\u0645\u0645\u0627","\u0631\u063A\u0645","\u0628\u064A","\u0644\u0627\u0646","\u0647\u0644","\u0644\u0648",
    "\u0628\u0645\u0627","\u0627\u0646\u0627","\u062A\u064A","\u0628\u0644\u0627","\u0642\u0628\u0644","\u0627\u0644\u0646","\u064A\u0627\u0647","\u0644\u062F\u064A","\u0628\u0644","\u0644\u0646\u0627","\u0627\u0645",
    "\u0627\u0646\u0646\u0627","\u0644\u0642\u062F","\u062D\u064A\u062A","\u0627\u0630\u0646","\u0627\u0644\u064A","\u0628\u0630\u0644\u0643","\u062E\u0644\u0644","\u062D\u0648\u0644","\u0644\u0643","\u062A\u0645\u0627",
    "\u0644\u0645\u0646","\u0644\u0646\u0647","\u0627\u0644\u0627","\u0627\u064A\u0646","\u0639\u0645\u0627","\u0628\u0643\u0644","\u0648\u0647\u0646\u0627\u0643","\u0646\u0647\u0627",
    "\u0648\u0647\u0630\u0647","\u0648\u0645\u0627","\u0647\u0645\u0627","\u0648\u0647\u0645","\u0644\u0647\u0630\u0647","\u0639\u0646\u0647","\u0645\u062A\u0646","\u0644\u0645\u0627","\u0643\u0645","\u0645\u062A\u0649",
    "\u0647\u0643\u0630\u0627","\u0627\u064A\u0647","\u0644\u0643\u0646\u0647","\u062A\u0645","\u0644\u064A\u0643","\u0648\u0644\u0643","\u0644\u0645\u0630\u0627","\u062C\u062F","\u0641\u0641\u064A","\u062F\u064A","\u0625\u064A",
    "\u0635\u0641\u0631","\u0648\u0627\u062D\u062F","\u0627\u062B\u0646\u0627\u0646","\u062B\u0644\u0627\u062B\u0629","\u0623\u0631\u0628\u0639\u0629","\u062E\u0645\u0633\u0629","\u0633\u062A\u0629","\u0633\u0628\u0639\u0629",
    "\u062B\u0645\u0627\u0646\u064A\u0629","\u062A\u0633\u0639\u0629","\u0639\u0634\u0631\u0629","\u0639\u0634\u0631","\u0623\u062D\u062F",
    "\u0627\u062B\u0646\u0627","\u062B\u0644\u0627\u062B\u0629","\u0623\u0631\u0628\u0639\u0629","\u062E\u0645\u0633\u0629","\u0633\u062A\u0629",
    "\u0633\u0628\u0639\u0629","\u062B\u0645\u0627\u0646\u064A\u0629","\u062A\u0633\u0639\u0629","\u0639\u0634\u0631\u0648\u0646","\u062B\u0644\u0627\u062B\u0648\u0646",
    "\u0623\u0631\u0628\u0639\u0648\u0646","\u062E\u0645\u0633\u0648\u0646","\u0633\u062A\u0648\u0646","\u0633\u0628\u0639\u0648\u0646","\u062B\u0645\u0627\u0646\u0648\u0646","\u062A\u0633\u0639\u0648\u0646","\u0645\u0626\u0629",
    "\u0645\u0627\u0626\u0629","\u0623\u0646\u0627","\u0627\u0646\u062A","\u0627\u0646\u062A\u064E","\u0627\u0646\u062A\u0649","\u0627\u0646\u062A\u0650","\u0647\u0648","\u0647\u064A","\u0646\u062D\u0646","\u0623\u0646\u062A\u0645\u0627",
    "\u0647\u0645\u0627","\u0623\u0646\u062A\u0645","\u0623\u0646\u062A\u0646","\u0647\u0645","\u0647\u0646"].freeze

  # Diacritics (fathatan..sukun) and the tatweel
  DIACRITICS = "[\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652\u0640]"

  # Initial alef variants
  ALIFS = "[\u0622\u0623\u0625\u0671]"

  # Hamzaas
  HAMZAAS = "[\u0621\u0624\u0626]"

  # Affix sets
  # Prefixes of length three
  P3 = ["\u0643\u0627\u0644", "\u0628\u0627\u0644", "\u0648\u0644\u0644", "\u0648\u0627\u0644"].freeze

  # Prefixes of length two
  P2 = ["\u0627\u0644", "\u0644\u0644"].freeze

  # Prefixes of length one
  P1 = ["\u0644", "\u0628", "\u0641", "\u0633", "\u0648", "\u064a", "\u062a", "\u0646", "\u0627"].freeze

  # Suffixes of length three
  S3 = ["\u062a\u0645\u0644", "\u0647\u0645\u0644", "\u062a\u0627\u0646", "\u062a\u064a\u0646", "\u0643\u0645\u0644"].freeze

  # Suffixes of length two
  S2 = ["\u0648\u0646", "\u0627\u062a", "\u0627\u0646", "\u064a\u0646", "\u062a\u0646", "\u0643\u0645", "\u0647\u0646", "\u0646\u0627", "\u064a\u0627",
    "\u0647\u0627", "\u062a\u0645", "\u0643\u0646", "\u0646\u064a", "\u0648\u0627", "\u0645\u0627", "\u0647\u0645"].freeze

  # Suffixes of length one
  S1 = ["\u0629", "\u0647", "\u064a", "\u0643", "\u062a", "\u0627", "\u0646"].freeze

  # Patterns and roots
  # Patterns of length four
  PR4 = { 0 => ["\u0645"],
          1 => ["\u0627"],
          2 => ["\u0627", "\u0648", "\u064A"],
          3 => ["\u0629"] }.freeze

  # Patterns of length five with length-three roots
  PR53 = { 0 => ["\u0627", "\u062a"],
           1 => ["\u0627", "\u064a", "\u0648"],
           2 => ["\u0627", "\u062a", "\u0645"],
           3 => ["\u0645", "\u064a", "\u062a"],
           4 => ["\u0645", "\u062a"],
           5 => ["\u0627", "\u0648"],
           6 => ["\u0627", "\u0645"] }.freeze

  # Letters
  DOUBLE_WAW = "\u0648\u0648"
  ALEF = "\u0627"
  MEEM = "\u0645"
  TEH_MARBUTA = "\u0629"
  TEH = "\u062a"
  NOON = "\u0646"
  YEH = "\u064a"
  WAW = "\u0648"
  YEH_WITH_HAMZA_ABOVE = "\u0626"

  # Stems
  IST = "\u0627\u0633\u062a"
  MST = "\u0645\u0633\u062a"
  MT = "\u0645\u062a"

end
data/nlp_arabic.gemspec
ADDED
@@ -0,0 +1,23 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'nlp_arabic/version'

Gem::Specification.new do |spec|
  spec.name          = "nlp_arabic"
  spec.version       = NlpArabic::VERSION
  spec.authors       = ["Othmane Laousy"]
  spec.email         = ["othmane.laousy@gmail.com"]

  spec.summary       = %q{Natural Language Processing Tools for Arabic}
  spec.description   = %q{This gem is intended to contain tools for Arabic Natural Language Processing.}
  spec.homepage      = "https://github.com/othmanela/nlp_arabic"

  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
end
metadata
ADDED
@@ -0,0 +1,82 @@
--- !ruby/object:Gem::Specification
name: nlp_arabic
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- Othmane Laousy
autorequire:
bindir: exe
cert_chain: []
date: 2015-05-11 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.9'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.9'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
description: This gem is intended to contain tools for Arabic Natural Language Processing.
email:
- othmane.laousy@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
- ".travis.yml"
- Gemfile
- README.md
- Rakefile
- bin/console
- bin/setup
- lib/nlp_arabic.rb
- lib/nlp_arabic/characters.rb
- lib/nlp_arabic/version.rb
- nlp_arabic.gemspec
homepage: https://github.com/othmanela/nlp_arabic
licenses: []
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.6
signing_key:
specification_version: 4
summary: Natural Language Processing Tools for Arabic
test_files: []