flashtext 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b9bc0b315226f3f55d68f38f6997ae8be4ad3377
4
- data.tar.gz: 9293854acfd5611be5a54c2257927bd53c8ac19f
3
+ metadata.gz: d8ca3508b7355433ef41fc40eead394cdc2149d2
4
+ data.tar.gz: 3715210544ee9170cf9c559ab10193b016e63615
5
5
  SHA512:
6
- metadata.gz: ae6751f2f37175f1271ad6f3aea385213522df7db12b977ae2e03f246d36e06cca88deb3e82ec952528a3110632f97ea41673656d4a4fa565a0e39fc6b40ece1
7
- data.tar.gz: 53badf293130a98770d67fc5b20d86bb0830607602559cf77a3943f40f2f1b9b907ca6f340cdfde74c6ce8fcad992a9b7f229e0b69eafc339684bae0cb355e41
6
+ metadata.gz: 2af1340ae2088f1327f689a0fc9c6036758f85571c3fbe6045b89593d2e5a45df8c5d2bcf705c51aa8fa69aad24504f71edd604454fd7cccfa485a31a605669f
7
+ data.tar.gz: 0daa3bc37c6820204950c02f6766d2e11ab3850ebfaca8abd75582aa0453a0a2f6df59d2bdf19bd83e4108597f48caec8cb7c224cfc4a5295a106a57c21ac735
@@ -0,0 +1,37 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ flashtext (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.3)
10
+ json (2.1.0)
11
+ rake (10.5.0)
12
+ rspec (3.7.0)
13
+ rspec-core (~> 3.7.0)
14
+ rspec-expectations (~> 3.7.0)
15
+ rspec-mocks (~> 3.7.0)
16
+ rspec-core (3.7.0)
17
+ rspec-support (~> 3.7.0)
18
+ rspec-expectations (3.7.0)
19
+ diff-lcs (>= 1.2.0, < 2.0)
20
+ rspec-support (~> 3.7.0)
21
+ rspec-mocks (3.7.0)
22
+ diff-lcs (>= 1.2.0, < 2.0)
23
+ rspec-support (~> 3.7.0)
24
+ rspec-support (3.7.0)
25
+
26
+ PLATFORMS
27
+ ruby
28
+
29
+ DEPENDENCIES
30
+ bundler (~> 1.16)
31
+ flashtext!
32
+ json (~> 2.0)
33
+ rake (~> 10.0)
34
+ rspec (~> 3.0)
35
+
36
+ BUNDLED WITH
37
+ 1.16.0
data/README.md CHANGED
@@ -1,2 +1,35 @@
1
- # flashtext
2
- Ruby Gem for FlashText algorithm
1
+ # Flashtext Ruby Gem
2
+
3
+ ### Ruby (zero dependencies) gem for the amazing Python package [flashtext](https://github.com/vi3k6i5/flashtext)
4
+
5
+ This module can be used to replace keywords in sentences or extract keywords from sentences. It is based on the [FlashText algorithm](https://arxiv.org/abs/1711.00046)
6
+
7
+ More about the FlashText algorithm can be found in the resources below.
8
+
9
+ The original paper published on the [FlashText algorithm](https://arxiv.org/abs/1711.00046)
10
+
11
+ The article published on [Medium freeCodeCamp](https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f)
12
+
13
+
14
+ Installation
15
+ ------------
16
+ $ gem install flashtext
17
+
18
+
19
+ API doc
20
+ -------
21
+
22
+ Documentation can be found at [FlashText on RubyDoc](http://www.rubydoc.info/gems/flashtext/)
23
+
24
+ ## Usage
25
+
26
+ Contribute
27
+ ----------
28
+
29
+ - Issue Tracker: https://github.com/imran3180/flashtext/issues
30
+ - Source Code: https://github.com/imran3180/flashtext
31
+
32
+
33
+ ## License
34
+
35
+ This code is released under the MIT license.
@@ -33,4 +33,5 @@ Gem::Specification.new do |spec|
33
33
  spec.add_development_dependency "bundler", "~> 1.16"
34
34
  spec.add_development_dependency "rake", "~> 10.0"
35
35
  spec.add_development_dependency "rspec", "~> 3.0"
36
+ spec.add_development_dependency "json", "~> 2.0"
36
37
  end
@@ -1,5 +1,4 @@
1
1
  require "flashtext/version"
2
-
2
+ require "flashtext/keyword_processor"
3
3
  module Flashtext
4
- # Your code goes here...
5
4
  end
@@ -0,0 +1,134 @@
1
require 'set'

module Flashtext
  # Trie-based keyword matcher (a Ruby port of the FlashText algorithm).
  #
  # Keywords are stored character by character in a nested Hash
  # (+keyword_trie_hash+). The node reached after the last character of a
  # keyword carries the keyword's "clean name" under the sentinel key
  # +_keyword+. {#extract_keywords} walks a sentence once and collects the
  # clean names of the keywords it finds, preferring the longest match.
  class KeywordProcessor
    # _keyword::           sentinel key that marks "a keyword ends here" in a trie node.
    # _white_space_chars:: set of punctuation/whitespace characters (not read by the
    #                      methods of this class; kept for API compatibility).
    # keyword_trie_hash::  root node of the trie.
    # case_sensitive::     when falsy, keywords and sentences are downcased before use.
    # word_boundaries::    NOTE: despite its name this holds the characters that are
    #                      *part of* a word (0-9, A-Z, a-z, _); every character that is
    #                      not a member is treated as a word boundary.
    attr_accessor :_keyword, :_white_space_chars, :keyword_trie_hash, :case_sensitive, :word_boundaries

    # @param case_sensitive [Boolean] match keywords case-sensitively (default: false)
    def initialize(case_sensitive = false)
      self._keyword = '_keyword_'
      # Double quotes are required here: in single quotes '\t' is a backslash
      # followed by "t", not a tab character.
      self._white_space_chars = Set.new(['.', "\t", "\n", "\a", ' ', ','])
      self.keyword_trie_hash = {}
      self.case_sensitive = case_sensitive
      self.word_boundaries = Set.new(('0'..'9').to_a + ('A'..'Z').to_a + ('a'..'z').to_a + ['_'])
    end

    # Registers +keyword+ in the trie.
    #
    # @param keyword [String] text to look for; nil or empty keywords are ignored
    # @param clean_name [String, nil] value reported when the keyword is found;
    #   defaults to +keyword+ as given (before any downcasing)
    # @return [String, nil] the clean name stored, or nil when nothing was added
    def add_keyword(keyword, clean_name = nil)
      # An empty keyword would be stored on the trie root and then "match" at
      # every word boundary, so it is rejected together with nil.
      return unless keyword && !keyword.empty?

      clean_name ||= keyword
      keyword = keyword.downcase unless case_sensitive

      # Walk/create one trie level per character.
      node = keyword.each_char.reduce(keyword_trie_hash) { |hash, char| hash[char] ||= {} }
      node[_keyword] = clean_name
    end

    # Registers many keywords at once.
    #
    # @param keyword_hash [Hash{#to_s => Array<#to_s>}] maps a clean name to the
    #   list of keywords that should resolve to it
    # @raise [ArgumentError] if +keyword_hash+ is not a Hash or one of its values
    #   is not an Array
    # @return [Hash] +keyword_hash+
    def add_keywords_from_hash(keyword_hash)
      raise ArgumentError, "#{keyword_hash} is not hash. argument expected: Hash" unless keyword_hash.is_a?(Hash)

      keyword_hash.each do |clean_name, keywords|
        raise ArgumentError, "#{keywords} is not array. expected: Array" unless keywords.is_a?(Array)

        keywords.each { |keyword| add_keyword(keyword.to_s, clean_name.to_s) }
      end
    end

    # Scans +sentence+ and returns the clean names of all keywords found, in
    # order of appearance.
    #
    # @param sentence [String, nil] text to scan
    # @return [Array] clean names of the matched keywords; empty for nil input
    def extract_keywords(sentence)
      keywords_extracted = []
      return keywords_extracted unless sentence

      sentence = sentence.downcase unless case_sensitive
      current_hash = keyword_trie_hash

      sequence_end_pos = 0
      idx = 0
      sentence_len = sentence.length

      while idx < sentence_len
        char = sentence[idx]

        if !word_boundaries.member?(char)
          # +char+ is a boundary character: a word has just ended.
          if current_hash.key?(_keyword) || current_hash.key?(char)
            longest_sequence_found = nil
            is_longer_seq_found = false

            # The word that just ended is itself a keyword.
            if current_hash.key?(_keyword)
              longest_sequence_found = current_hash[_keyword]
              sequence_end_pos = idx
            end

            # The boundary character is also part of a longer (multi-word)
            # keyword: look ahead for the longest keyword starting here.
            if current_hash.key?(char)
              current_hash_continued = current_hash[char]

              idy = idx + 1
              while idy < sentence_len
                inner_char = sentence[idy]
                if !word_boundaries.member?(inner_char) && current_hash_continued.key?(_keyword)
                  # A longer keyword ends right before this boundary; keep
                  # going in case an even longer one exists.
                  longest_sequence_found = current_hash_continued[_keyword]
                  sequence_end_pos = idy
                  is_longer_seq_found = true
                end
                break unless current_hash_continued.key?(inner_char)

                current_hash_continued = current_hash_continued[inner_char]
                idy += 1
              end

              # The look-ahead ran off the end of the sentence (no +break+)
              # while sitting on a complete keyword.
              if idy == sentence_len && current_hash_continued.key?(_keyword)
                longest_sequence_found = current_hash_continued[_keyword]
                sequence_end_pos = idy
                is_longer_seq_found = true
              end

              # Resume scanning after the longer match.
              idx = sequence_end_pos if is_longer_seq_found
            end

            current_hash = keyword_trie_hash # restart from the trie root
            keywords_extracted << longest_sequence_found if longest_sequence_found
          else
            current_hash = keyword_trie_hash # restart from the trie root
          end
        elsif current_hash.key?(char)
          # Still inside a potential keyword: descend one level.
          current_hash = current_hash[char]
        else
          # The current word cannot be a keyword: restart from the root and
          # skip forward to the next boundary character.
          current_hash = keyword_trie_hash
          while idx < sentence_len
            break unless word_boundaries.member?(sentence[idx])

            idx += 1
          end
        end

        # End of sentence reached while sitting on a complete keyword.
        if idx + 1 >= sentence_len && current_hash.key?(_keyword)
          keywords_extracted << current_hash[_keyword]
        end

        idx += 1
      end

      keywords_extracted
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Flashtext
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flashtext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Imran
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-11-16 00:00:00.000000000 Z
11
+ date: 2017-11-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: json
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.0'
55
69
  description: Ruby implementation of FlashText Algorithm
56
70
  email:
57
71
  - imranjannatiitkgp@gmail.com
@@ -64,6 +78,7 @@ files:
64
78
  - ".travis.yml"
65
79
  - CODE_OF_CONDUCT.md
66
80
  - Gemfile
81
+ - Gemfile.lock
67
82
  - LICENSE.txt
68
83
  - README.md
69
84
  - Rakefile
@@ -71,6 +86,7 @@ files:
71
86
  - bin/setup
72
87
  - flashtext.gemspec
73
88
  - lib/flashtext.rb
89
+ - lib/flashtext/keyword_processor.rb
74
90
  - lib/flashtext/version.rb
75
91
  homepage: https://github.com/imran3180/flashtext
76
92
  licenses: