flashtext 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +37 -0
- data/README.md +35 -2
- data/flashtext.gemspec +1 -0
- data/lib/flashtext.rb +1 -2
- data/lib/flashtext/keyword_processor.rb +134 -0
- data/lib/flashtext/version.rb +1 -1
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d8ca3508b7355433ef41fc40eead394cdc2149d2
|
4
|
+
data.tar.gz: 3715210544ee9170cf9c559ab10193b016e63615
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2af1340ae2088f1327f689a0fc9c6036758f85571c3fbe6045b89593d2e5a45df8c5d2bcf705c51aa8fa69aad24504f71edd604454fd7cccfa485a31a605669f
|
7
|
+
data.tar.gz: 0daa3bc37c6820204950c02f6766d2e11ab3850ebfaca8abd75582aa0453a0a2f6df59d2bdf19bd83e4108597f48caec8cb7c224cfc4a5295a106a57c21ac735
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
flashtext (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
diff-lcs (1.3)
|
10
|
+
json (2.1.0)
|
11
|
+
rake (10.5.0)
|
12
|
+
rspec (3.7.0)
|
13
|
+
rspec-core (~> 3.7.0)
|
14
|
+
rspec-expectations (~> 3.7.0)
|
15
|
+
rspec-mocks (~> 3.7.0)
|
16
|
+
rspec-core (3.7.0)
|
17
|
+
rspec-support (~> 3.7.0)
|
18
|
+
rspec-expectations (3.7.0)
|
19
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
20
|
+
rspec-support (~> 3.7.0)
|
21
|
+
rspec-mocks (3.7.0)
|
22
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
23
|
+
rspec-support (~> 3.7.0)
|
24
|
+
rspec-support (3.7.0)
|
25
|
+
|
26
|
+
PLATFORMS
|
27
|
+
ruby
|
28
|
+
|
29
|
+
DEPENDENCIES
|
30
|
+
bundler (~> 1.16)
|
31
|
+
flashtext!
|
32
|
+
json (~> 2.0)
|
33
|
+
rake (~> 10.0)
|
34
|
+
rspec (~> 3.0)
|
35
|
+
|
36
|
+
BUNDLED WITH
|
37
|
+
1.16.0
|
data/README.md
CHANGED
@@ -1,2 +1,35 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# Flashtext Ruby Gem
|
2
|
+
|
3
|
+
### Ruby (zero dependencies) gem for amazing Python package [flashtext](https://github.com/vi3k6i5/flashtext)
|
4
|
+
|
5
|
+
This module can be used to replace keywords in sentences or extract keywords from sentences. It is based on the [FlashText algorithm](https://arxiv.org/abs/1711.00046)
|
6
|
+
|
7
|
+
More about the FlashText algorithm can be found here:
|
8
|
+
|
9
|
+
The original paper on the [FlashText algorithm](https://arxiv.org/abs/1711.00046)
|
10
|
+
|
11
|
+
The article published on [Medium freeCodeCamp](https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f)
|
12
|
+
|
13
|
+
|
14
|
+
Installation
|
15
|
+
------------
|
16
|
+
$ gem install flashtext
|
17
|
+
|
18
|
+
|
19
|
+
API doc
|
20
|
+
-------
|
21
|
+
|
22
|
+
Documentation can be found at [FlashText Read the Docs](http://www.rubydoc.info/gems/flashtext/)
|
23
|
+
|
24
|
+
## Usage
|
25
|
+
|
26
|
+
Contribute
|
27
|
+
----------
|
28
|
+
|
29
|
+
- Issue Tracker: https://github.com/imran3180/flashtext/issues
|
30
|
+
- Source Code: https://github.com/imran3180/flashtext
|
31
|
+
|
32
|
+
|
33
|
+
## License
|
34
|
+
|
35
|
+
This code is under MIT license.
|
data/flashtext.gemspec
CHANGED
data/lib/flashtext.rb
CHANGED
@@ -0,0 +1,134 @@
|
|
1
|
+
require 'set'

module Flashtext
  # Trie-backed keyword extractor (Ruby port of the FlashText algorithm).
  #
  # Keywords are stored character by character in nested hashes
  # (+keyword_trie_hash+). The node reached after the last character of a
  # keyword holds the keyword's clean name under the +_keyword+ marker key.
  class KeywordProcessor
    # _keyword::            marker key that flags the end of a keyword in the trie.
    # _white_space_chars::  set of characters treated as white space (not read
    #                       by any method in this class).
    # keyword_trie_hash::   root of the nested-hash trie.
    # case_sensitive::      when falsy, keywords and sentences are downcased.
    # word_boundaries::     despite the name, this is the set of characters that
    #                       are PART of a word (0-9, A-Z, a-z, "_"); any character
    #                       outside the set ends a word.
    attr_accessor :_keyword, :_white_space_chars, :keyword_trie_hash, :case_sensitive, :word_boundaries

    # @param case_sensitive [Boolean] match keywords case-sensitively (default: false)
    def initialize(case_sensitive = false)
      self._keyword = '_keyword_'
      # Double quotes are required here: '\t' in single quotes is a backslash
      # followed by "t", not a tab character.
      self._white_space_chars = Set.new(['.', "\t", "\n", "\a", ' ', ','])
      self.keyword_trie_hash = {}
      self.case_sensitive = case_sensitive
      self.word_boundaries = Set.new([*'0'..'9', *'A'..'Z', *'a'..'z', '_'])
    end

    # Adds a single keyword to the trie.
    #
    # @param keyword [String] text to look for
    # @param clean_name [String, nil] value reported when the keyword is found;
    #   defaults to +keyword+ itself
    # @return [String, nil] the clean name stored, or nil when nothing was added
    #   (nil or empty keyword)
    def add_keyword(keyword, clean_name = nil)
      clean_name ||= keyword
      return unless keyword && clean_name
      # An empty keyword would put the terminal marker on the trie root and
      # make every word boundary look like a match.
      return if keyword.empty?

      keyword = keyword.downcase unless case_sensitive
      node = keyword.each_char.inject(keyword_trie_hash) { |hash, char| hash[char] ||= {} }
      node[_keyword] = clean_name
    end

    # Adds many keywords at once.
    #
    # @param keyword_hash [Hash{#to_s => Array<#to_s>}] clean name => list of
    #   keywords that map to it
    # @raise [ArgumentError] if +keyword_hash+ is not a Hash or one of its
    #   values is not an Array
    # @return [Hash] the hash that was passed in
    def add_keywords_from_hash(keyword_hash)
      raise ArgumentError, "#{keyword_hash} is not hash. argument expected: Hash" unless keyword_hash.is_a?(Hash)

      keyword_hash.each do |clean_name, keywords|
        raise ArgumentError, "#{keywords.inspect} is not array. expected: Array" unless keywords.is_a?(Array)

        keywords.each { |keyword| add_keyword(keyword.to_s, clean_name.to_s) }
      end
    end

    # Scans +sentence+ once and collects the clean names of every keyword found,
    # preferring the longest keyword when several start at the same position.
    #
    # @param sentence [String, nil] text to search
    # @return [Array<String>] clean names in order of appearance; empty for a
    #   nil or empty sentence
    def extract_keywords(sentence)
      keywords_extracted = []
      return keywords_extracted if sentence.nil? || sentence.empty?

      sentence = sentence.downcase unless case_sensitive
      # Index into an array of characters instead of the string itself, so each
      # lookup is a plain array access even for multibyte text.
      chars = sentence.chars
      sentence_len = chars.length
      current_hash = keyword_trie_hash
      sequence_end_pos = 0
      idx = 0

      while idx < sentence_len
        char = chars[idx]
        if !word_boundaries.member?(char)
          # char ends the current word: either a keyword ends here, or a
          # multi-word keyword continues through this character.
          if current_hash.key?(_keyword) || current_hash.key?(char)
            longest_sequence_found = nil
            is_longer_seq_found = false

            if current_hash.key?(_keyword)
              longest_sequence_found = current_hash[_keyword]
              sequence_end_pos = idx
            end

            # Look ahead from this position for a longer keyword.
            if current_hash.key?(char)
              current_hash_continued = current_hash[char]

              idy = idx + 1
              while idy < sentence_len
                inner_char = chars[idy]
                if !word_boundaries.member?(inner_char) && current_hash_continued.key?(_keyword)
                  # A longer keyword ends right before inner_char; remember it
                  # and keep looking for an even longer one.
                  longest_sequence_found = current_hash_continued[_keyword]
                  sequence_end_pos = idy
                  is_longer_seq_found = true
                end
                break unless current_hash_continued.key?(inner_char)

                current_hash_continued = current_hash_continued[inner_char]
                idy += 1
              end
              # The look-ahead ran off the end of the sentence (no break), so a
              # keyword may end exactly at the last character.
              if idy == sentence_len && current_hash_continued.key?(_keyword)
                longest_sequence_found = current_hash_continued[_keyword]
                sequence_end_pos = idy
                is_longer_seq_found = true
              end
              idx = sequence_end_pos if is_longer_seq_found
            end
            current_hash = keyword_trie_hash # reset
            keywords_extracted << longest_sequence_found if longest_sequence_found
          else
            current_hash = keyword_trie_hash # reset
          end
        elsif current_hash.key?(char)
          # Still inside a possible keyword; descend one level.
          current_hash = current_hash[char]
        else
          # The current word cannot be a keyword: reset and skip to its end.
          current_hash = keyword_trie_hash
          idx += 1 while idx < sentence_len && word_boundaries.member?(chars[idx])
        end
        # At the end of the sentence, flush a keyword that ends on the last character.
        keywords_extracted << current_hash[_keyword] if idx + 1 >= sentence_len && current_hash.key?(_keyword)
        idx += 1
      end
      keywords_extracted
    end
  end
end
|
data/lib/flashtext/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: flashtext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Imran
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-11-
|
11
|
+
date: 2017-11-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: json
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '2.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '2.0'
|
55
69
|
description: Ruby implementation of FlashText Algorithm
|
56
70
|
email:
|
57
71
|
- imranjannatiitkgp@gmail.com
|
@@ -64,6 +78,7 @@ files:
|
|
64
78
|
- ".travis.yml"
|
65
79
|
- CODE_OF_CONDUCT.md
|
66
80
|
- Gemfile
|
81
|
+
- Gemfile.lock
|
67
82
|
- LICENSE.txt
|
68
83
|
- README.md
|
69
84
|
- Rakefile
|
@@ -71,6 +86,7 @@ files:
|
|
71
86
|
- bin/setup
|
72
87
|
- flashtext.gemspec
|
73
88
|
- lib/flashtext.rb
|
89
|
+
- lib/flashtext/keyword_processor.rb
|
74
90
|
- lib/flashtext/version.rb
|
75
91
|
homepage: https://github.com/imran3180/flashtext
|
76
92
|
licenses:
|