flashtext 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +43 -2
- data/lib/flashtext/keyword_processor.rb +111 -0
- data/lib/flashtext/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 869b2bf065283da58b746046bc45317ca9a47526
|
4
|
+
data.tar.gz: 0111edfd1206e610a0a8727181fb98abd3bfc5c9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e66b94f29f8912987b4411953b33cb470163a4c6c9409778941b6f4f4722c8305eceb8fbef8b07dca0fcbf4c1f4a12dd767305a4b32b2cba2909815b7a994542
|
7
|
+
data.tar.gz: 62029dadbaca46ebc761596a7a683764dd0d8ff9b89f475787d3288785d86be8fb432eb4b2bb91b8966636c0947b335595169a25f05e1bad503283c8154f0978
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -4,8 +4,7 @@
|
|
4
4
|
|
5
5
|
This module can be used to replace keywords in sentences or extract keywords from sentences. It is based on the [FlashText algorithm](https://arxiv.org/abs/1711.00046)
|
6
6
|
|
7
|
-
More about Flashtext algorithm
|
8
|
-
|
7
|
+
#### More about Flashtext algorithm.
|
9
8
|
The original paper published on [FlashText algorithm](https://arxiv.org/abs/1711.00046)
|
10
9
|
|
11
10
|
The article published on [Medium freeCodeCamp](https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f)
|
@@ -22,6 +21,41 @@ API doc
|
|
22
21
|
Documentation can be found at [FlashText Read the Docs](http://www.rubydoc.info/gems/flashtext/)
|
23
22
|
|
24
23
|
## Usage
|
24
|
+
#### Extract keywords
|
25
|
+
```ruby
|
26
|
+
keyword_processor = Flashtext::KeywordProcessor.new
|
27
|
+
# keyword_processor.add_keyword(<unclean name>, <standardised name>)
|
28
|
+
keyword_processor.add_keyword('Big Apple', 'New York')
|
29
|
+
keyword_processor.add_keyword('Bay Area')
|
30
|
+
keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
|
31
|
+
keywords_found
|
32
|
+
#=> ["New York", "Bay Area"]
|
33
|
+
```
|
34
|
+
|
35
|
+
#### Replace keywords
|
36
|
+
```ruby
|
37
|
+
keyword_processor.add_keyword('New Delhi', 'NCR region')
|
38
|
+
new_sentence = keyword_processor.replace_keywords('I love Big Apple and new delhi.')
|
39
|
+
new_sentence
|
40
|
+
#=> "I love New York and NCR region."
|
41
|
+
```
|
42
|
+
|
43
|
+
#### Case Sensitive example
|
44
|
+
```ruby
|
45
|
+
keyword_processor = Flashtext::KeywordProcessor.new(case_sensitive = true)
|
46
|
+
keyword_processor.add_keyword('Big Apple', 'New York')
|
47
|
+
keyword_processor.add_keyword('Bay Area')
|
48
|
+
keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
|
49
|
+
keywords_found
|
50
|
+
#=> ['Bay Area']
|
51
|
+
```
|
52
|
+
|
53
|
+
|
54
|
+
Test
|
55
|
+
----------
|
56
|
+
```sh
|
57
|
+
rspec spec
|
58
|
+
```
|
25
59
|
|
26
60
|
Contribute
|
27
61
|
----------
|
@@ -29,6 +63,13 @@ Contribute
|
|
29
63
|
- Issue Tracker: https://github.com/imran3180/flashtext/issues
|
30
64
|
- Source Code: https://github.com/imran3180/flashtext
|
31
65
|
|
66
|
+
Implementation in other languages
|
67
|
+
---------------------------------
|
68
|
+
|
69
|
+
- Python: https://github.com/vi3k6i5/flashtext (Core Project)
|
70
|
+
- JavaScript: https://github.com/drenther/flashtext.js
|
71
|
+
- Golang: https://github.com/sundy-li/flashtext
|
72
|
+
|
32
73
|
|
33
74
|
## License
|
34
75
|
|
@@ -130,5 +130,116 @@ module Flashtext
|
|
130
130
|
end
|
131
131
|
keywords_extracted
|
132
132
|
end
|
133
|
+
|
134
|
+
def replace_keywords sentence
|
135
|
+
if sentence.nil? || sentence.empty?
|
136
|
+
return sentence
|
137
|
+
end
|
138
|
+
new_sentence = ""
|
139
|
+
original_sentence = sentence
|
140
|
+
sentence = sentence.downcase if not case_sensitive
|
141
|
+
current_word = ""
|
142
|
+
current_hash = keyword_trie_hash
|
143
|
+
current_white_space = ""
|
144
|
+
sequence_end_pos = 0
|
145
|
+
idx = 0
|
146
|
+
sentence_len = sentence.length
|
147
|
+
|
148
|
+
while idx < sentence_len
|
149
|
+
char = sentence[idx]
|
150
|
+
current_word += original_sentence[idx]
|
151
|
+
|
152
|
+
if not word_boundaries.member?(char)
|
153
|
+
current_white_space = char
|
154
|
+
if current_hash.has_key?(_keyword) or current_hash.has_key?(char)
|
155
|
+
# update longest sequence found
|
156
|
+
sequence_found = nil
|
157
|
+
longest_sequence_found = nil
|
158
|
+
is_longer_seq_found = false
|
159
|
+
if current_hash.has_key?(_keyword)
|
160
|
+
sequence_found = current_hash[_keyword]
|
161
|
+
longest_sequence_found = current_hash[_keyword]
|
162
|
+
sequence_end_pos = idx
|
163
|
+
end
|
164
|
+
|
165
|
+
# re look for longest_sequence from this position
|
166
|
+
if current_hash.has_key?(char)
|
167
|
+
current_hash_continued = current_hash[char]
|
168
|
+
current_word_continued = current_word
|
169
|
+
idy = idx + 1
|
170
|
+
while idy < sentence_len
|
171
|
+
inner_char = sentence[idy]
|
172
|
+
current_word_continued += original_sentence[idy]
|
173
|
+
if !word_boundaries.member?(inner_char) and current_hash_continued.has_key?(_keyword)
|
174
|
+
# Update longest sequence found
|
175
|
+
current_white_space = inner_char
|
176
|
+
longest_sequence_found = current_hash_continued[_keyword]
|
177
|
+
sequence_end_pos = idy
|
178
|
+
is_longer_seq_found = true
|
179
|
+
end
|
180
|
+
if current_hash_continued.has_key?(inner_char)
|
181
|
+
current_hash_continued = current_hash_continued[inner_char]
|
182
|
+
else
|
183
|
+
break
|
184
|
+
end
|
185
|
+
idy += 1
|
186
|
+
end
|
187
|
+
if idy == sentence_len # end of sentence reached.
|
188
|
+
if current_hash_continued.member?(_keyword)
|
189
|
+
# update longest sequence found
|
190
|
+
current_white_space = ""
|
191
|
+
longest_sequence_found = current_hash_continued[_keyword]
|
192
|
+
sequence_end_pos = idy
|
193
|
+
is_longer_seq_found = true
|
194
|
+
end
|
195
|
+
end
|
196
|
+
if is_longer_seq_found
|
197
|
+
idx = sequence_end_pos
|
198
|
+
current_word = current_word_continued
|
199
|
+
end
|
200
|
+
end
|
201
|
+
current_hash = keyword_trie_hash
|
202
|
+
if longest_sequence_found
|
203
|
+
new_sentence += (longest_sequence_found + current_white_space)
|
204
|
+
current_word = ''
|
205
|
+
current_white_space = ''
|
206
|
+
else
|
207
|
+
new_sentence += current_word
|
208
|
+
current_word = ''
|
209
|
+
current_white_space = ''
|
210
|
+
end
|
211
|
+
else
|
212
|
+
# we reset current_hash
|
213
|
+
current_hash = keyword_trie_hash
|
214
|
+
new_sentence += current_word
|
215
|
+
current_word = ''
|
216
|
+
current_white_space = ''
|
217
|
+
end
|
218
|
+
elsif current_hash.has_key?(char)
|
219
|
+
# we can continue from this char
|
220
|
+
current_hash = current_hash[char]
|
221
|
+
else
|
222
|
+
# reset current_hash
|
223
|
+
current_hash = keyword_trie_hash
|
224
|
+
idy = idx + 1
|
225
|
+
while idy < sentence_len
|
226
|
+
char = sentence[idy]
|
227
|
+
current_word += original_sentence[idy]
|
228
|
+
break if not word_boundaries.member?(char)
|
229
|
+
idy += 1
|
230
|
+
end
|
231
|
+
idx = idy
|
232
|
+
new_sentence += current_word
|
233
|
+
current_word = ""
|
234
|
+
current_white_space = ""
|
235
|
+
end
|
236
|
+
if idx + 1 >= sentence_len && current_hash.has_key?(_keyword)
|
237
|
+
sequence_found = current_hash[_keyword]
|
238
|
+
new_sentence += sequence_found
|
239
|
+
end
|
240
|
+
idx = idx + 1 # loop increment
|
241
|
+
end
|
242
|
+
return new_sentence
|
243
|
+
end
|
133
244
|
end
|
134
245
|
end
|
data/lib/flashtext/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: flashtext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Imran
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|