keyphrase 0.2.0 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/keyphrase/stoplist/eng.rb +0 -2
- data/lib/keyphrase/stoplist/pan.rb +0 -11
- data/lib/keyphrase/version.rb +1 -1
- data/lib/keyphrase.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 59f4193d2821ad7ebe8b7bdf83e5fe9b396f8c38e6ef2fe1d8e24d224a20ba27
|
4
|
+
data.tar.gz: 43ce37834a6316df02476de2efbb17d0c3c6020930697654602336f100eaf3f6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dedf3654d9d48f58fe151d43810c604936760357a9a9e98a1c3676cff373c838e47ef30b36ca5155a87f4e654a2afb5affbc59313a710f03e5575be73c9266a5
|
7
|
+
data.tar.gz: 77b4a3735afd50d97dd1b71b608f3538b292182c2f92971d92eca592a4b0d75d38c728d2975517a18ae80224f3f27f5869251b14c4c56a3aef4a15bb04e24372
|
data/lib/keyphrase/stoplist/pan.rb
CHANGED
@@ -4,7 +4,6 @@
|
|
4
4
|
def self.stopwords
|
5
5
|
@@stopwords ||= [
|
6
6
|
"ਦੇ",
|
7
|
-
"0",
|
8
7
|
"ਵਿੱਚ",
|
9
8
|
"ਦਾ",
|
10
9
|
"ਅਤੇ",
|
@@ -18,7 +17,6 @@
|
|
18
17
|
"ਨੇ",
|
19
18
|
"ਤੇ",
|
20
19
|
"ਨਾਲ",
|
21
|
-
"1",
|
22
20
|
"ਲਈ",
|
23
21
|
"ਵੀ",
|
24
22
|
"ਸੀ",
|
@@ -30,7 +28,6 @@
|
|
30
28
|
"ਹਨ",
|
31
29
|
"ਜਾਂਦਾ",
|
32
30
|
"ਕੀਤਾ",
|
33
|
-
"2",
|
34
31
|
"ਗਿਆ",
|
35
32
|
"ਹੀ",
|
36
33
|
"ਕੇ",
|
@@ -47,7 +44,6 @@
|
|
47
44
|
"ਨਹੀਂ",
|
48
45
|
"ਭਾਰਤੀ",
|
49
46
|
"ਪਿੰਡ",
|
50
|
-
"3",
|
51
47
|
"ਸਿੰਘ",
|
52
48
|
"ਉੱਤੇ",
|
53
49
|
"ਸਾਲ",
|
@@ -65,7 +61,6 @@
|
|
65
61
|
"ਪਰ",
|
66
62
|
"ਦੁਆਰਾ",
|
67
63
|
"ਰੂਪ",
|
68
|
-
"4",
|
69
64
|
"ਹੋਰ",
|
70
65
|
"ਕੰਮ",
|
71
66
|
"ਆਪਣੀ",
|
@@ -80,7 +75,6 @@
|
|
80
75
|
"ਜਾ",
|
81
76
|
"ਵਾਲੇ",
|
82
77
|
"ਸ਼ੁਰੂ",
|
83
|
-
"5",
|
84
78
|
"ਉਸਨੇ",
|
85
79
|
"ਕਿਹਾ",
|
86
80
|
"ਹੋਣ",
|
@@ -103,7 +97,6 @@
|
|
103
97
|
"ਹੁੰਦੇ",
|
104
98
|
"ਸ਼ਹਿਰ",
|
105
99
|
"ਭਾਸ਼ਾ",
|
106
|
-
"6",
|
107
100
|
"ਹੋਈ",
|
108
101
|
"ਅਨੁਸਾਰ",
|
109
102
|
"ਸਕਦਾ",
|
@@ -133,7 +126,6 @@
|
|
133
126
|
"ਨਾਂ",
|
134
127
|
"ਦੌਰਾਨ",
|
135
128
|
"ਤਰ੍ਹਾਂ",
|
136
|
-
"7",
|
137
129
|
"ਯੂਨੀਵਰਸਿਟੀ",
|
138
130
|
"ਨਾ",
|
139
131
|
"ਏ",
|
@@ -149,7 +141,6 @@
|
|
149
141
|
"ਅੰਗਰੇਜ਼ੀ",
|
150
142
|
"ਉਸਨੂੰ",
|
151
143
|
"ਉਹਨਾਂ",
|
152
|
-
"8",
|
153
144
|
"ਸਥਿਤ",
|
154
145
|
"ਫਿਰ",
|
155
146
|
"ਜੀਵਨ",
|
@@ -170,13 +161,11 @@
|
|
170
161
|
"ਉਮਰ",
|
171
162
|
"ਬਲਾਕ",
|
172
163
|
"ਰਹੇ",
|
173
|
-
"10",
|
174
164
|
"ਸਾਹਿਬ",
|
175
165
|
"ਕਰਦੀ",
|
176
166
|
"ਹਰ",
|
177
167
|
"ਪੈਦਾ",
|
178
168
|
"ਘੱਟ",
|
179
|
-
"9",
|
180
169
|
"ਲੇਖਕ",
|
181
170
|
"ਹਿੱਸਾ",
|
182
171
|
"ਫ਼ਿਲਮ",
|
data/lib/keyphrase/version.rb
CHANGED
data/lib/keyphrase.rb
CHANGED
@@ -7,8 +7,8 @@ class Keyphrase
|
|
7
7
|
autoload :Stoplist, "keyphrase/stoplist"
|
8
8
|
|
9
9
|
CLEAN_REGEX = /([^\p{L}a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
|
10
|
-
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\p{L}]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
|
11
|
-
CLEAN_SPACES_REGEX =
|
10
|
+
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\p{L}0-9]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
|
11
|
+
CLEAN_SPACES_REGEX = /^[0-9\s\.]+$|\s+/ # last phase. Remove extra whitespace and lone numbers
|
12
12
|
SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
|
13
13
|
|
14
14
|
def self.analyse text, options={}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: keyphrase
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben D'Angelo
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|