arabic_stemmer 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/Gemfile +4 -0
- data/LICENSE +116 -0
- data/README.md +46 -0
- data/Rakefile +10 -0
- data/arabic_stemmer.gemspec +23 -0
- data/lib/arabic_stemmer.rb +55 -0
- data/lib/arabic_stemmer/version.rb +3 -0
- data/test/test_arabic_stemmer.rb +63 -0
- metadata +82 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c4d98fba862287649d985263b44b0638d038d68b
|
4
|
+
data.tar.gz: eddab32c1720a3c912974e239ea2cb6b5c1169ed
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 5b7fecc125266c1644160ee4682e8c404f8139f838817e271f677c5e930bea7e9f3d27e9c5dbe71e8bdfa264c0aa0551690d9dfa202026195a63ada971abc877
|
7
|
+
data.tar.gz: c66776aecfe00722f799e948e61c134188d67e9addc0ac554325771ce2c21ec1ddcdff1fef7615b8394d16202e74507a397d223fe5e162ab301cac4c2fd21b32
|
data/.gitignore
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
*.rbc
|
2
|
+
capybara-*.html
|
3
|
+
.rspec
|
4
|
+
/log
|
5
|
+
/tmp
|
6
|
+
/db/*.sqlite3
|
7
|
+
/public/system
|
8
|
+
/coverage/
|
9
|
+
/spec/tmp
|
10
|
+
**.orig
|
11
|
+
rerun.txt
|
12
|
+
pickle-email-*.html
|
13
|
+
|
14
|
+
# TODO Comment out these rules if you are OK with secrets being uploaded to the repo
|
15
|
+
config/initializers/secret_token.rb
|
16
|
+
config/secrets.yml
|
17
|
+
|
18
|
+
## Environment normalisation:
|
19
|
+
/.bundle
|
20
|
+
/vendor/bundle
|
21
|
+
|
22
|
+
# these should all be checked in to normalise the environment:
|
23
|
+
# Gemfile.lock, .ruby-version, .ruby-gemset
|
24
|
+
|
25
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
26
|
+
.rvmrc
|
27
|
+
|
28
|
+
# if using bower-rails ignore default bower_components path bower.json files
|
29
|
+
/vendor/assets/bower_components
|
30
|
+
*.bowerrc
|
31
|
+
bower.json
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
CC0 1.0 Universal
|
2
|
+
|
3
|
+
Statement of Purpose
|
4
|
+
|
5
|
+
The laws of most jurisdictions throughout the world automatically confer
|
6
|
+
exclusive Copyright and Related Rights (defined below) upon the creator and
|
7
|
+
subsequent owner(s) (each and all, an "owner") of an original work of
|
8
|
+
authorship and/or a database (each, a "Work").
|
9
|
+
|
10
|
+
Certain owners wish to permanently relinquish those rights to a Work for the
|
11
|
+
purpose of contributing to a commons of creative, cultural and scientific
|
12
|
+
works ("Commons") that the public can reliably and without fear of later
|
13
|
+
claims of infringement build upon, modify, incorporate in other works, reuse
|
14
|
+
and redistribute as freely as possible in any form whatsoever and for any
|
15
|
+
purposes, including without limitation commercial purposes. These owners may
|
16
|
+
contribute to the Commons to promote the ideal of a free culture and the
|
17
|
+
further production of creative, cultural and scientific works, or to gain
|
18
|
+
reputation or greater distribution for their Work in part through the use and
|
19
|
+
efforts of others.
|
20
|
+
|
21
|
+
For these and/or other purposes and motivations, and without any expectation
|
22
|
+
of additional consideration or compensation, the person associating CC0 with a
|
23
|
+
Work (the "Affirmer"), to the extent that he or she is an owner of Copyright
|
24
|
+
and Related Rights in the Work, voluntarily elects to apply CC0 to the Work
|
25
|
+
and publicly distribute the Work under its terms, with knowledge of his or her
|
26
|
+
Copyright and Related Rights in the Work and the meaning and intended legal
|
27
|
+
effect of CC0 on those rights.
|
28
|
+
|
29
|
+
1. Copyright and Related Rights. A Work made available under CC0 may be
|
30
|
+
protected by copyright and related or neighboring rights ("Copyright and
|
31
|
+
Related Rights"). Copyright and Related Rights include, but are not limited
|
32
|
+
to, the following:
|
33
|
+
|
34
|
+
i. the right to reproduce, adapt, distribute, perform, display, communicate,
|
35
|
+
and translate a Work;
|
36
|
+
|
37
|
+
ii. moral rights retained by the original author(s) and/or performer(s);
|
38
|
+
|
39
|
+
iii. publicity and privacy rights pertaining to a person's image or likeness
|
40
|
+
depicted in a Work;
|
41
|
+
|
42
|
+
iv. rights protecting against unfair competition in regards to a Work,
|
43
|
+
subject to the limitations in paragraph 4(a), below;
|
44
|
+
|
45
|
+
v. rights protecting the extraction, dissemination, use and reuse of data in
|
46
|
+
a Work;
|
47
|
+
|
48
|
+
vi. database rights (such as those arising under Directive 96/9/EC of the
|
49
|
+
European Parliament and of the Council of 11 March 1996 on the legal
|
50
|
+
protection of databases, and under any national implementation thereof,
|
51
|
+
including any amended or successor version of such directive); and
|
52
|
+
|
53
|
+
vii. other similar, equivalent or corresponding rights throughout the world
|
54
|
+
based on applicable law or treaty, and any national implementations thereof.
|
55
|
+
|
56
|
+
2. Waiver. To the greatest extent permitted by, but not in contravention of,
|
57
|
+
applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
|
58
|
+
unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
|
59
|
+
and Related Rights and associated claims and causes of action, whether now
|
60
|
+
known or unknown (including existing as well as future claims and causes of
|
61
|
+
action), in the Work (i) in all territories worldwide, (ii) for the maximum
|
62
|
+
duration provided by applicable law or treaty (including future time
|
63
|
+
extensions), (iii) in any current or future medium and for any number of
|
64
|
+
copies, and (iv) for any purpose whatsoever, including without limitation
|
65
|
+
commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes
|
66
|
+
the Waiver for the benefit of each member of the public at large and to the
|
67
|
+
detriment of Affirmer's heirs and successors, fully intending that such Waiver
|
68
|
+
shall not be subject to revocation, rescission, cancellation, termination, or
|
69
|
+
any other legal or equitable action to disrupt the quiet enjoyment of the Work
|
70
|
+
by the public as contemplated by Affirmer's express Statement of Purpose.
|
71
|
+
|
72
|
+
3. Public License Fallback. Should any part of the Waiver for any reason be
|
73
|
+
judged legally invalid or ineffective under applicable law, then the Waiver
|
74
|
+
shall be preserved to the maximum extent permitted taking into account
|
75
|
+
Affirmer's express Statement of Purpose. In addition, to the extent the Waiver
|
76
|
+
is so judged Affirmer hereby grants to each affected person a royalty-free,
|
77
|
+
non transferable, non sublicensable, non exclusive, irrevocable and
|
78
|
+
unconditional license to exercise Affirmer's Copyright and Related Rights in
|
79
|
+
the Work (i) in all territories worldwide, (ii) for the maximum duration
|
80
|
+
provided by applicable law or treaty (including future time extensions), (iii)
|
81
|
+
in any current or future medium and for any number of copies, and (iv) for any
|
82
|
+
purpose whatsoever, including without limitation commercial, advertising or
|
83
|
+
promotional purposes (the "License"). The License shall be deemed effective as
|
84
|
+
of the date CC0 was applied by Affirmer to the Work. Should any part of the
|
85
|
+
License for any reason be judged legally invalid or ineffective under
|
86
|
+
applicable law, such partial invalidity or ineffectiveness shall not
|
87
|
+
invalidate the remainder of the License, and in such case Affirmer hereby
|
88
|
+
affirms that he or she will not (i) exercise any of his or her remaining
|
89
|
+
Copyright and Related Rights in the Work or (ii) assert any associated claims
|
90
|
+
and causes of action with respect to the Work, in either case contrary to
|
91
|
+
Affirmer's express Statement of Purpose.
|
92
|
+
|
93
|
+
4. Limitations and Disclaimers.
|
94
|
+
|
95
|
+
a. No trademark or patent rights held by Affirmer are waived, abandoned,
|
96
|
+
surrendered, licensed or otherwise affected by this document.
|
97
|
+
|
98
|
+
b. Affirmer offers the Work as-is and makes no representations or warranties
|
99
|
+
of any kind concerning the Work, express, implied, statutory or otherwise,
|
100
|
+
including without limitation warranties of title, merchantability, fitness
|
101
|
+
for a particular purpose, non infringement, or the absence of latent or
|
102
|
+
other defects, accuracy, or the present or absence of errors, whether or not
|
103
|
+
discoverable, all to the greatest extent permissible under applicable law.
|
104
|
+
|
105
|
+
c. Affirmer disclaims responsibility for clearing rights of other persons
|
106
|
+
that may apply to the Work or any use thereof, including without limitation
|
107
|
+
any person's Copyright and Related Rights in the Work. Further, Affirmer
|
108
|
+
disclaims responsibility for obtaining any necessary consents, permissions
|
109
|
+
or other rights required for any use of the Work.
|
110
|
+
|
111
|
+
d. Affirmer understands and acknowledges that Creative Commons is not a
|
112
|
+
party to this document and has no duty or obligation with respect to this
|
113
|
+
CC0 or use of the Work.
|
114
|
+
|
115
|
+
For more information, please see
|
116
|
+
<http://creativecommons.org/publicdomain/zero/1.0/>
|
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
arabic_stemmer
|
2
|
+
==============
|
3
|
+
|
4
|
+
Lightweight gem to stem Arabic words
|
5
|
+
|
6
|
+
This small gem lets you stem Arabic words, i.e. remove their common prefixes and suffixes.
|
7
|
+
It is a lightweight stemmer.
|
8
|
+
|
9
|
+
Examples
|
10
|
+
<br />
|
11
|
+
المعلمون => معلم
|
12
|
+
<br />
|
13
|
+
المدرسة => مدرسة
|
14
|
+
<br />
|
15
|
+
الأجهزة => اجهزة
|
16
|
+
|
17
|
+
Installation
|
18
|
+
============
|
19
|
+
|
20
|
+
Add this line to your application's Gemfile:
|
21
|
+
|
22
|
+
gem 'arabic_stemmer'
|
23
|
+
|
24
|
+
And then execute:
|
25
|
+
|
26
|
+
$ bundle
|
27
|
+
|
28
|
+
Or install it yourself as:
|
29
|
+
|
30
|
+
$ gem install arabic_stemmer
|
31
|
+
|
32
|
+
Usage
|
33
|
+
=====
|
34
|
+
|
35
|
+
word = "المعلمون"
|
36
|
+
result = ArabicStemmer.to_arabic_stem(word)
|
37
|
+
|
38
|
+
Contributing
|
39
|
+
============
|
40
|
+
|
41
|
+
1. Fork it ( https://github.com/shuaibzahda/arabic_stemmer/fork )
|
42
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
43
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
44
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
45
|
+
5. Create a new Pull Request
|
46
|
+
6. Please ensure all unit tests pass. To run them use: `rake test`
|
data/Rakefile
ADDED
data/arabic_stemmer.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for arabic_stemmer.

# Put ./lib on the load path so the version file can be required straight
# from the source tree.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'arabic_stemmer/version'

Gem::Specification.new do |spec|
  spec.name          = "arabic_stemmer"
  spec.version       = ArabicStemmer::VERSION
  spec.authors       = ["Shuaib Zahda"]
  spec.email         = ["shuaib.zahda@gmail.com"]
  spec.summary       = %q{Lightweight Arabic word stemmer}
  spec.description   = %q{Stem Arabic word for example: المبرمجون مبرمج}
  spec.homepage      = "https://github.com/shuaibzahda/arabic_stemmer"
  # Use the SPDX identifier for CC0 1.0 Universal; RubyGems warns about
  # free-form license strings such as "CC0 1.0".
  spec.license       = "CC0-1.0"

  # Package every file tracked by git (NUL-separated so unusual file names
  # survive the split).
  spec.files         = `git ls-files -z`.split("\x0")
  # Anything under bin/ is exposed as an executable, by base name.
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.6"
  spec.add_development_dependency "rake", "~> 0"
end
|
data/lib/arabic_stemmer.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
require "arabic_stemmer/version"
|
2
|
+
|
3
|
+
# Lightweight Arabic stemmer: normalises a word, then strips at most one
# common prefix and one common two-letter suffix from it.
module ArabicStemmer
  # Normalises +word+ and returns its light stem.
  #
  # The word is stripped of surrounding whitespace, listed punctuation and
  # diacritics, and its alif variants are unified. Words in the keyword list,
  # and words of four characters or fewer, are returned after normalisation
  # without any affix removal.
  #
  # word - String holding a single Arabic word.
  #
  # Returns a new String; the argument is not modified.
  def self.to_arabic_stem(word)
    key_words = ["الله"]

    # 1. Strip surrounding whitespace and drop punctuation (Latin and Arabic).
    word = word.strip.gsub(/[._,،\"\':;&?؟()]/, '')

    # 2. Remove diacritical marks U+064B..U+0652
    #    (tanween forms, fatha, damma, kasra, shadda, sukun).
    word = word.gsub(/[\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652]/, '')
    # word = word.gsub(/[\uFE77]/, '')

    # 3. Convert the alif variants U+0622, U+0623 and U+0625 to a bare alif.
    word = word.gsub(/[\u0622\u0623\u0625]/, 'ا')

    # 4. Keywords and short words are never stemmed.
    return word if key_words.include?(word) || is_word_short(word)

    # 5. Remove one prefix; stop if that already made the word short.
    word = remove_prefix(word)
    return word if is_word_short(word)

    # 6. Remove one suffix.
    remove_suffix(word)
  end

  # Removes a trailing two-letter suffix from +word+ if it is one of the
  # known suffixes.
  #
  # Only the END of the word is trimmed. The previous implementation used
  # String#slice! with the suffix text, which deletes the FIRST occurrence of
  # that text (possibly at the start or middle of the word) and also mutated
  # the caller's string.
  #
  # Returns the word without its suffix, or +word+ itself when no known
  # suffix is present.
  def self.remove_suffix(word)
    suffixes = ["هم", "ين", "ون", "ان", "كم", "ات", "هن"]
    ending = word[-2, 2] # nil when the word has fewer than two characters
    return word unless suffixes.include?(ending)

    word[0...-2]
  end

  # Removes one leading prefix from +word+: first tries the two-letter
  # prefixes, then the three-letter ones.
  #
  # No length check is done here; the caller decides whether the word is long
  # enough to be stemmed.
  #
  # Returns the word without its prefix, or +word+ itself when it starts with
  # none of the known prefixes.
  def self.remove_prefix(word)
    prefix_2 = ["ال", "سي", "ست", "لل"]
    return word[2..-1] if prefix_2.include?(word[0..1])

    prefix_3 = ["وال", "كال", "بال"]
    return word[3..-1] if prefix_3.include?(word[0..2])

    word
  end

  # Returns true when +word+ has four characters or fewer.
  def self.is_word_short(word)
    word.size <= 4
  end
end
|
data/test/test_arabic_stemmer.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'arabic_stemmer'
|
2
|
+
require 'test/unit'
|
3
|
+
|
4
|
+
# Unit tests for ArabicStemmer.to_arabic_stem. Each test lists
# [input, expected stem] pairs and checks them one by one.
class ArabicStemmerTest < Test::Unit::TestCase
  # Keywords are returned unchanged.
  def test_do_not_stem_keywords
    check_stems([["الله", "الله"]])
  end

  # Punctuation around a word is dropped.
  def test_remove_non_alphanumeric
    expected_word = "كلمة"
    noisy_inputs = ["&كلمة.:;", "كلمة,،_", "كلمة\"\';", "كلمة?؟)("]
    check_stems(noisy_inputs.map { |noisy| [noisy, expected_word] })
  end

  # Diacritical marks are removed.
  def test_remove_diacratics
    check_stems([["بَ", "ب"]])
  end

  # Alif variants are converted to a bare alif.
  def test_convert_alif
    check_stems([
      ["ألف", "الف"],
      ["حاكم", "حاكم"],
      ["إسلام", "اسلام"]
    ])
  end

  # Words without known affixes, short or long, come back as they went in.
  def test_long_and_short_words_no_changes
    unchanged = ["ضرب", "حذاء", "مقاطعة", "برنامج", "بريطانيا"]
    check_stems(unchanged.map { |same| [same, same] })
  end

  # Every known two-letter suffix is stripped.
  def test_remove_suffixes
    suffixed = ["معلمان", "معلمون", "معلمات", "معلمين", "معلمهن", "معلمهم", "معلمكم"]
    check_stems(suffixed.map { |form| [form, "معلم"] })
  end

  # Known prefixes are stripped, except on short words.
  def test_remove_prefixes
    check_stems([
      ["الرب", "الرب"],
      ["الإسلام", "اسلام"],
      ["الطلبة", "طلبة"],
      ["الضرب", "ضرب"],
      ["كالسيف", "سيف"],
      ["والسيف", "سيف"],
      ["سيقول", "قول"],
      ["ستقول", "قول"],
      ["سيبرمج", "برمج"],
      ["للبرمجة", "برمجة"],
      ["بالسيارة", "سيارة"]
    ])
  end

  # Prefix and suffix removal combined.
  def test_stem_words
    check_stems([
      ["المناضلين", "مناضل"],
      ["المستوطنون", "مستوطن"]
    ])
  end

  private

  # Asserts, for each [input, expected] pair, that the stem of input equals
  # expected.
  def check_stems(pairs)
    pairs.each do |input, expected|
      assert_equal expected, ArabicStemmer.to_arabic_stem(input)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: arabic_stemmer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Shuaib Zahda
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-11-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: 'Stem Arabic word for example: المبرمجون مبرمج'
|
42
|
+
email:
|
43
|
+
- shuaib.zahda@gmail.com
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- ".gitignore"
|
49
|
+
- Gemfile
|
50
|
+
- LICENSE
|
51
|
+
- README.md
|
52
|
+
- Rakefile
|
53
|
+
- arabic_stemmer.gemspec
|
54
|
+
- lib/arabic_stemmer.rb
|
55
|
+
- lib/arabic_stemmer/version.rb
|
56
|
+
- test/test_arabic_stemmer.rb
|
57
|
+
homepage: https://github.com/shuaibzahda/arabic_stemmer
|
58
|
+
licenses:
|
59
|
+
- CC0 1.0
|
60
|
+
metadata: {}
|
61
|
+
post_install_message:
|
62
|
+
rdoc_options: []
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '0'
|
75
|
+
requirements: []
|
76
|
+
rubyforge_project:
|
77
|
+
rubygems_version: 2.4.4
|
78
|
+
signing_key:
|
79
|
+
specification_version: 4
|
80
|
+
summary: Lightweight Arabic word stemmer
|
81
|
+
test_files:
|
82
|
+
- test/test_arabic_stemmer.rb
|