flashtext 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +43 -2
- data/lib/flashtext/keyword_processor.rb +111 -0
- data/lib/flashtext/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 869b2bf065283da58b746046bc45317ca9a47526
|
4
|
+
data.tar.gz: 0111edfd1206e610a0a8727181fb98abd3bfc5c9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e66b94f29f8912987b4411953b33cb470163a4c6c9409778941b6f4f4722c8305eceb8fbef8b07dca0fcbf4c1f4a12dd767305a4b32b2cba2909815b7a994542
|
7
|
+
data.tar.gz: 62029dadbaca46ebc761596a7a683764dd0d8ff9b89f475787d3288785d86be8fb432eb4b2bb91b8966636c0947b335595169a25f05e1bad503283c8154f0978
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -4,8 +4,7 @@
|
|
4
4
|
|
5
5
|
This module can be used to replace keywords in sentences or extract keywords from sentences. It is based on the [FlashText algorithm](https://arxiv.org/abs/1711.00046)
|
6
6
|
|
7
|
-
More about Flashtext algorithm
|
8
|
-
|
7
|
+
#### More about Flashtext algorithm.
|
9
8
|
The original paper published on [FlashText algorithm](https://arxiv.org/abs/1711.00046)
|
10
9
|
|
11
10
|
The article published on [Medium freeCodeCamp](https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f)
|
@@ -22,6 +21,41 @@ API doc
|
|
22
21
|
Documentation can be found at [FlashText Read the Docs](http://www.rubydoc.info/gems/flashtext/)
|
23
22
|
|
24
23
|
## Usage
|
24
|
+
#### Extract keywords
|
25
|
+
```ruby
|
26
|
+
keyword_processor = Flashtext::KeywordProcessor.new
|
27
|
+
# keyword_processor.add_keyword(<unclean name>, <standardised name>)
|
28
|
+
keyword_processor.add_keyword('Big Apple', 'New York')
|
29
|
+
keyword_processor.add_keyword('Bay Area')
|
30
|
+
keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
|
31
|
+
keywords_found
|
32
|
+
#=> ["New York", "Bay Area"]
|
33
|
+
```
|
34
|
+
|
35
|
+
#### Replace keywords
|
36
|
+
```ruby
|
37
|
+
keyword_processor.add_keyword('New Delhi', 'NCR region')
|
38
|
+
new_sentence = keyword_processor.replace_keywords('I love Big Apple and new delhi.')
|
39
|
+
new_sentence
|
40
|
+
#=> "I love New York and NCR region."
|
41
|
+
```
|
42
|
+
|
43
|
+
#### Case Sensitive example
|
44
|
+
```ruby
|
45
|
+
keyword_processor = Flashtext::KeywordProcessor.new(case_sensitive = true)
|
46
|
+
keyword_processor.add_keyword('Big Apple', 'New York')
|
47
|
+
keyword_processor.add_keyword('Bay Area')
|
48
|
+
keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
|
49
|
+
keywords_found
|
50
|
+
#=> ['Bay Area']
|
51
|
+
```
|
52
|
+
|
53
|
+
|
54
|
+
Test
|
55
|
+
----------
|
56
|
+
```ruby
|
57
|
+
rspec spec
|
58
|
+
```
|
25
59
|
|
26
60
|
Contribute
|
27
61
|
----------
|
@@ -29,6 +63,13 @@ Contribute
|
|
29
63
|
- Issue Tracker: https://github.com/imran3180/flashtext/issues
|
30
64
|
- Source Code: https://github.com/imran3180/flashtext
|
31
65
|
|
66
|
+
Implementation in other languages
|
67
|
+
---------------------------------
|
68
|
+
|
69
|
+
- Python: https://github.com/vi3k6i5/flashtext (Core Project)
|
70
|
+
- JavaScript: https://github.com/drenther/flashtext.js
|
71
|
+
- Golang: https://github.com/sundy-li/flashtext
|
72
|
+
|
32
73
|
|
33
74
|
## License
|
34
75
|
|
@@ -130,5 +130,116 @@ module Flashtext
|
|
130
130
|
end
|
131
131
|
keywords_extracted
|
132
132
|
end
|
133
|
+
|
134
|
+
# Returns a copy of +sentence+ in which every keyword stored in the trie is
# replaced by the value stored under the trie's +_keyword+ marker.
#
# The sentence is scanned one character at a time while walking
# +keyword_trie_hash+. A match is only accepted when it is followed by a
# boundary character (one that is NOT a member of +word_boundaries+) or by the
# end of the sentence, and the longest match starting at a position wins.
# Text that does not take part in a match is copied through unchanged, using
# the original (non-downcased) characters.
#
# @param sentence [String, nil] text to scan
# @return [String, nil] the rewritten text; +sentence+ itself when it is nil
#   or empty
def replace_keywords(sentence)
  return sentence if sentence.nil? || sentence.empty?

  new_sentence = ''.dup
  original_sentence = sentence
  # Matching runs on the (optionally) downcased text; output is assembled from
  # original_sentence by index.
  # NOTE(review): this assumes downcase preserves the character count; some
  # Unicode characters may change length when downcased - confirm for
  # non-ASCII input.
  sentence = sentence.downcase unless case_sensitive
  current_word = ''.dup
  current_hash = keyword_trie_hash
  idx = 0
  sentence_len = sentence.length

  while idx < sentence_len
    char = sentence[idx]
    current_word << original_sentence[idx]

    if !word_boundaries.member?(char)
      # char terminates the current word.
      if current_hash.key?(_keyword) || current_hash.key?(char)
        current_white_space = char
        # A keyword may end right here (nil when it does not) ...
        longest_sequence_found = current_hash.fetch(_keyword, nil)
        sequence_end_pos = idx
        is_longer_seq_found = false

        # ... but a longer keyword may continue across this boundary char.
        if current_hash.key?(char)
          current_hash_continued = current_hash[char]
          # dup: the look-ahead must not mutate current_word unless it wins.
          current_word_continued = current_word.dup
          idy = idx + 1
          while idy < sentence_len
            inner_char = sentence[idy]
            current_word_continued << original_sentence[idy]
            if !word_boundaries.member?(inner_char) && current_hash_continued.key?(_keyword)
              # Longer keyword ends just before this boundary char.
              current_white_space = inner_char
              longest_sequence_found = current_hash_continued[_keyword]
              sequence_end_pos = idy
              is_longer_seq_found = true
            end
            break unless current_hash_continued.key?(inner_char)

            current_hash_continued = current_hash_continued[inner_char]
            idy += 1
          end

          # Look-ahead ran off the end of the sentence while on a keyword.
          if idy == sentence_len && current_hash_continued.key?(_keyword)
            current_white_space = ''
            longest_sequence_found = current_hash_continued[_keyword]
            sequence_end_pos = idy
            is_longer_seq_found = true
          end

          if is_longer_seq_found
            idx = sequence_end_pos
            current_word = current_word_continued
          end
        end

        new_sentence << if longest_sequence_found
                          longest_sequence_found + current_white_space
                        else
                          current_word
                        end
      else
        new_sentence << current_word
      end
      # Either way, matching restarts from the trie root with an empty buffer.
      current_hash = keyword_trie_hash
      current_word = ''.dup
    elsif current_hash.key?(char)
      # Still on a trie path; keep following it.
      current_hash = current_hash[char]
    else
      # Not a keyword prefix: copy the rest of this word (up to and including
      # the next boundary char) through untouched.
      current_hash = keyword_trie_hash
      idy = idx + 1
      while idy < sentence_len
        current_word << original_sentence[idy]
        break unless word_boundaries.member?(sentence[idy])

        idy += 1
      end
      idx = idy
      new_sentence << current_word
      current_word = ''.dup
    end

    # End of sentence: emit the keyword that ends here, or flush whatever is
    # still buffered so a trailing partial match is not dropped.
    if idx + 1 >= sentence_len
      new_sentence << (current_hash.key?(_keyword) ? current_hash[_keyword] : current_word)
    end
    idx += 1
  end

  new_sentence
end
|
133
244
|
end
|
134
245
|
end
|
data/lib/flashtext/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: flashtext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Imran
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|