flashtext 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b9bc0b315226f3f55d68f38f6997ae8be4ad3377
4
- data.tar.gz: 9293854acfd5611be5a54c2257927bd53c8ac19f
3
+ metadata.gz: d8ca3508b7355433ef41fc40eead394cdc2149d2
4
+ data.tar.gz: 3715210544ee9170cf9c559ab10193b016e63615
5
5
  SHA512:
6
- metadata.gz: ae6751f2f37175f1271ad6f3aea385213522df7db12b977ae2e03f246d36e06cca88deb3e82ec952528a3110632f97ea41673656d4a4fa565a0e39fc6b40ece1
7
- data.tar.gz: 53badf293130a98770d67fc5b20d86bb0830607602559cf77a3943f40f2f1b9b907ca6f340cdfde74c6ce8fcad992a9b7f229e0b69eafc339684bae0cb355e41
6
+ metadata.gz: 2af1340ae2088f1327f689a0fc9c6036758f85571c3fbe6045b89593d2e5a45df8c5d2bcf705c51aa8fa69aad24504f71edd604454fd7cccfa485a31a605669f
7
+ data.tar.gz: 0daa3bc37c6820204950c02f6766d2e11ab3850ebfaca8abd75582aa0453a0a2f6df59d2bdf19bd83e4108597f48caec8cb7c224cfc4a5295a106a57c21ac735
@@ -0,0 +1,37 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ flashtext (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.3)
10
+ json (2.1.0)
11
+ rake (10.5.0)
12
+ rspec (3.7.0)
13
+ rspec-core (~> 3.7.0)
14
+ rspec-expectations (~> 3.7.0)
15
+ rspec-mocks (~> 3.7.0)
16
+ rspec-core (3.7.0)
17
+ rspec-support (~> 3.7.0)
18
+ rspec-expectations (3.7.0)
19
+ diff-lcs (>= 1.2.0, < 2.0)
20
+ rspec-support (~> 3.7.0)
21
+ rspec-mocks (3.7.0)
22
+ diff-lcs (>= 1.2.0, < 2.0)
23
+ rspec-support (~> 3.7.0)
24
+ rspec-support (3.7.0)
25
+
26
+ PLATFORMS
27
+ ruby
28
+
29
+ DEPENDENCIES
30
+ bundler (~> 1.16)
31
+ flashtext!
32
+ json (~> 2.0)
33
+ rake (~> 10.0)
34
+ rspec (~> 3.0)
35
+
36
+ BUNDLED WITH
37
+ 1.16.0
data/README.md CHANGED
@@ -1,2 +1,35 @@
1
- # flashtext
2
- Ruby Gem for FlashText algorithm
1
+ # Flashtext Ruby Gem
2
+
3
+ ### Ruby (zero dependencies) gem for the amazing Python package [flashtext](https://github.com/vi3k6i5/flashtext)
4
+
5
+ This module can be used to replace keywords in sentences or extract keywords from sentences. It is based on the [FlashText algorithm](https://arxiv.org/abs/1711.00046)
6
+
7
+ More about the FlashText algorithm can be found in the following resources:
8
+
9
+ The original paper on the [FlashText algorithm](https://arxiv.org/abs/1711.00046)
10
+
11
+ The article published on [Medium freeCodeCamp](https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f)
12
+
13
+
14
+ Installation
15
+ ------------
16
+ $ gem install flashtext
17
+
18
+
19
+ API doc
20
+ -------
21
+
22
+ Documentation can be found at [RubyDoc.info](http://www.rubydoc.info/gems/flashtext/)
23
+
24
+ ## Usage
25
+
26
+ Contribute
27
+ ----------
28
+
29
+ - Issue Tracker: https://github.com/imran3180/flashtext/issues
30
+ - Source Code: https://github.com/imran3180/flashtext
31
+
32
+
33
+ ## License
34
+
35
+ This code is under MIT license.
@@ -33,4 +33,5 @@ Gem::Specification.new do |spec|
33
33
  spec.add_development_dependency "bundler", "~> 1.16"
34
34
  spec.add_development_dependency "rake", "~> 10.0"
35
35
  spec.add_development_dependency "rspec", "~> 3.0"
36
+ spec.add_development_dependency "json", "~> 2.0"
36
37
  end
@@ -1,5 +1,4 @@
1
1
  require "flashtext/version"
2
-
2
+ require "flashtext/keyword_processor"
3
3
  module Flashtext
4
- # Your code goes here...
5
4
  end
@@ -0,0 +1,134 @@
1
require 'set'

module Flashtext
  # Trie-based keyword extractor (Ruby port of the FlashText algorithm).
  #
  # Keywords are stored one character per level in nested hashes
  # (+keyword_trie_hash+). The node reached after the last character of a
  # keyword carries the +_keyword+ marker key, whose value is the "clean name"
  # reported when that keyword is found in a sentence.
  #
  # Accessors:
  # * +_keyword+           - marker key stored in a trie node that ends a keyword.
  # * +_white_space_chars+ - set of whitespace/punctuation characters; it is
  #   not consulted by any method in this class.
  # * +keyword_trie_hash+  - root of the trie.
  # * +case_sensitive+     - when falsy, keywords and sentences are downcased
  #   before being stored/matched.
  # * +word_boundaries+    - despite the name, the set of characters that are
  #   considered PART of a word (0-9, A-Z, a-z, "_"). Any character that is
  #   not a member is treated as a word boundary during extraction.
  class KeywordProcessor
    attr_accessor :_keyword, :_white_space_chars, :keyword_trie_hash, :case_sensitive, :word_boundaries

    # @param case_sensitive [Boolean] match keywords case-sensitively when true.
    def initialize(case_sensitive = false)
      self._keyword = '_keyword_'
      # Double quotes are required here: '\t' in single quotes is the two
      # characters backslash + "t", not a tab.
      self._white_space_chars = Set.new(['.', "\t", "\n", "\a", ' ', ','])
      self.keyword_trie_hash = {}
      self.case_sensitive = case_sensitive
      self.word_boundaries = Set.new([*('0'..'9'), *('A'..'Z'), *('a'..'z'), '_'])
    end

    # Adds a single keyword to the trie.
    #
    # @param keyword [String, nil] text to look for; nil or empty is ignored.
    # @param clean_name [String, nil] value reported on a match; defaults to
    #   +keyword+ as given (before any downcasing).
    # @return [String, nil] the stored clean name, or nil when nothing was added.
    def add_keyword(keyword, clean_name = nil)
      return unless keyword
      # An empty keyword would mark the trie root itself as a keyword end and
      # produce a spurious match at every word boundary.
      return if keyword.empty?

      clean_name ||= keyword
      keyword = keyword.downcase unless case_sensitive

      node = keyword_trie_hash
      keyword.each_char { |char| node = (node[char] ||= {}) }
      node[_keyword] = clean_name
    end

    # Adds many keywords at once.
    #
    # @param keyword_hash [Hash{Object => Array}] maps each clean name to the
    #   list of keywords that should resolve to it; keys and keywords are
    #   converted with +to_s+.
    # @raise [ArgumentError] if +keyword_hash+ is not a Hash or one of its
    #   values is not an Array. Entries preceding the offending one have
    #   already been added when the error is raised.
    # @return [Hash] +keyword_hash+
    def add_keywords_from_hash(keyword_hash)
      raise ArgumentError, "#{keyword_hash} is not hash. argument expected: Hash" unless keyword_hash.is_a?(Hash)

      keyword_hash.each do |clean_name, keywords|
        raise ArgumentError, "#{keywords.inspect} is not array. expected: Array" unless keywords.is_a?(Array)

        keywords.each { |keyword| add_keyword(keyword.to_s, clean_name.to_s) }
      end
    end

    # Scans +sentence+ once and returns the clean names of every keyword found,
    # in order of appearance. When several keywords start at the same position
    # the longest one that ends on a word boundary wins.
    #
    # @param sentence [String, nil]
    # @return [Array] clean names of the matched keywords; empty for nil input.
    def extract_keywords(sentence)
      keywords_extracted = []
      return keywords_extracted unless sentence

      sentence = sentence.downcase unless case_sensitive
      current_hash = keyword_trie_hash

      sequence_end_pos = 0
      idx = 0
      sentence_len = sentence.length

      while idx < sentence_len
        char = sentence[idx]
        if !word_boundaries.member?(char)
          # Boundary character: either a keyword ends right here, or the
          # boundary character itself continues a multi-word keyword.
          if current_hash.key?(_keyword) || current_hash.key?(char)
            longest_sequence_found = nil
            is_longer_seq_found = false

            if current_hash.key?(_keyword)
              longest_sequence_found = current_hash[_keyword]
              sequence_end_pos = idx
            end

            # Look ahead from this position for a longer keyword.
            if current_hash.key?(char)
              current_hash_continued = current_hash[char]

              idy = idx + 1
              while idy < sentence_len
                inner_char = sentence[idy]
                if !word_boundaries.member?(inner_char) && current_hash_continued.key?(_keyword)
                  # A longer keyword ends at this boundary; keep extending.
                  longest_sequence_found = current_hash_continued[_keyword]
                  sequence_end_pos = idy
                  is_longer_seq_found = true
                end
                break unless current_hash_continued.key?(inner_char)

                current_hash_continued = current_hash_continued[inner_char]
                idy += 1
              end

              # The look-ahead ran off the end of the sentence on a keyword end.
              if idy == sentence_len && current_hash_continued.key?(_keyword)
                longest_sequence_found = current_hash_continued[_keyword]
                sequence_end_pos = idy
                is_longer_seq_found = true
              end

              # Resume the outer scan after the longer match.
              idx = sequence_end_pos if is_longer_seq_found
            end

            current_hash = keyword_trie_hash
            keywords_extracted << longest_sequence_found if longest_sequence_found
          else
            current_hash = keyword_trie_hash
          end
        elsif current_hash.key?(char)
          # Word character that continues a path in the trie.
          current_hash = current_hash[char]
        else
          # Word character with no path in the trie: restart from the root and
          # skip the remainder of the current word.
          current_hash = keyword_trie_hash
          idx += 1 while idx < sentence_len && word_boundaries.member?(sentence[idx])
        end

        # At the end of the sentence, flush a keyword that ends on the last character.
        if idx + 1 >= sentence_len && current_hash.key?(_keyword)
          keywords_extracted << current_hash[_keyword]
        end
        idx += 1
      end

      keywords_extracted
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Flashtext
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flashtext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Imran
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-11-16 00:00:00.000000000 Z
11
+ date: 2017-11-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: json
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.0'
55
69
  description: Ruby implementation of FlashText Algorithm
56
70
  email:
57
71
  - imranjannatiitkgp@gmail.com
@@ -64,6 +78,7 @@ files:
64
78
  - ".travis.yml"
65
79
  - CODE_OF_CONDUCT.md
66
80
  - Gemfile
81
+ - Gemfile.lock
67
82
  - LICENSE.txt
68
83
  - README.md
69
84
  - Rakefile
@@ -71,6 +86,7 @@ files:
71
86
  - bin/setup
72
87
  - flashtext.gemspec
73
88
  - lib/flashtext.rb
89
+ - lib/flashtext/keyword_processor.rb
74
90
  - lib/flashtext/version.rb
75
91
  homepage: https://github.com/imran3180/flashtext
76
92
  licenses: