categorize 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/lib/categorize/models/abstract_model.rb +36 -0
- data/lib/categorize/utils/gram_collection.rb +47 -0
- data/lib/categorize/utils/gram_node.rb +16 -0
- data/lib/categorize/utils/grams.rb +46 -0
- metadata +8 -6
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
OGNkMmQ5MzEwZGFlOWUxNWM0MzU0MTI0MTI2NzE5NTBlNGZjYzM3Ng==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YmNmMDE5NWMxYmZhNWI0ZDI2NDA3MjdkOTNjYmI2MGUzMWY0ZTVjZQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ZTdiM2IyMzRiOTg1Y2Y5MDc2ZWQwY2EyYjA3YTZjODEzYmM5MTU5NWVlNzBl
|
10
|
+
ZDdmYzhiNzdiOTYxOGY3YzgzNWFmZDhmMmIxODczZmY1NGM2MmM2NzI5NzVi
|
11
|
+
NWYzMGMwOGI2MWI5Mjk5NmY4MmMwM2YyZWFjNzU1MGMxMjcwYWI=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MjQ1NWQ4ZGVlMzNjZDZkNDVmODViOTY1ZTM4ZGZlYjhjMGVmNDQ4ZGRiNmRm
|
14
|
+
MGY1OTNhN2NkMzQ3Y2U4OGIyMDc3MTU2ZTc5MTE0ZGE4NTc4ODg2MGE5MjRm
|
15
|
+
N2M3MWQ4YzJhYzFjNTNjZTNjNDA3ZjVlM2RmZDVkMTcxNTFkNDM=
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
  module Models
    # Base class for categorization models. Turns groups of tokens into
    # count vectors and wraps them in an Ai4r data set.
    class AbstractModel
      require 'ai4r'

      # Sets up two empty caches. Neither is read in this class;
      # presumably subclasses use them -- TODO confirm.
      def initialize
        @gram_cache = {}
        @bigram_max_cache = {}
      end

      # Stores the token groups, labels and vectors on the instance and
      # returns the data set built from them.
      #
      # records_to_tokens - Hash whose values are Arrays of tokens.
      def build_vars(records_to_tokens)
        @tokens = records_to_tokens.values
        @labels, @vectors = vectorize(@tokens)
        build_dataset(@labels, @vectors)
      end

      # Builds one count vector per token group.
      #
      # token_groups - Array of Arrays of tokens.
      #
      # Returns [labels, vectors]: labels are the distinct tokens in
      # first-seen order; each vector holds, per label, how many times
      # that label occurs in the corresponding group.
      def vectorize(token_groups)
        labels = token_groups.flatten.uniq
        vectors = token_groups.map do |tokens|
          labels.map { |label| tokens.count(label) }
        end
        [labels, vectors]
      end

      # Wraps the vectors and their labels in an Ai4r::Data::DataSet.
      def build_dataset(labels, vectors)
        Ai4r::Data::DataSet.new(data_items: vectors, data_labels: labels)
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
  module Utils
    # Collects the unigrams, bigrams and trigrams found in one record's
    # word list, together with how often each one occurs.
    class GramCollection
      attr_reader :grams, :content_to_frequency, :content
      attr_accessor :fitness

      # content - the record these words belong to (stored as given).
      # words   - ordered list of words for the record.
      # invalid - callable; a gram is skipped when invalid.call(gram)
      #           is truthy.
      def initialize(content, words, invalid)
        @fitness = {}
        @content = content
        @invalid = invalid

        # TODO: n grammify this
        previous = nil
        before_previous = nil
        counts = {}
        @grams = {}

        words.each do |word|
          candidates = [word]
          # A bigram is only formed when the word differs from the one
          # before it; a trigram additionally requires the word to differ
          # from the one two positions back.
          if previous && previous != word
            pair = "#{previous} #{word}"
            candidates << pair
            if before_previous && word != before_previous
              candidates << "#{before_previous} #{pair}"
            end
          end

          candidates.compact.each do |gram|
            next if @invalid.call(gram)

            if counts[gram]
              counts[gram] += 1
              @grams[gram].frequency += 1
            else
              counts[gram] = 1
              @grams[gram] = GramNode.new(self, gram, 1)
            end
          end

          before_previous = previous
          previous = word
        end

        @content_to_frequency = counts
        # Expose the nodes as a plain list, in first-seen order.
        @grams = @grams.values
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
  module Utils
    # A single gram (its text) with an occurrence count and a reference
    # to the collection that created it.
    class GramNode
      attr_reader :content, :gram_collection
      attr_accessor :frequency

      # gram_collection - the owning collection, exposed via
      #                   #gram_collection.
      # content         - the gram text.
      # frequency       - initial occurrence count (default 0).
      def initialize(gram_collection, content, frequency = 0)
        # Store under the name the attr_reader uses; the previous
        # @gram_group assignment left #gram_collection returning nil.
        @gram_collection = gram_collection
        @content = content
        @frequency = frequency
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
  module Utils
    # Mixin with helpers for building and post-processing grams.
    module Grams
      # Builds one GramCollection per record and a de-duplicated list of
      # every gram across all records.
      #
      # query            - the search query String; grams equal to the
      #                    query, to the query with its first term moved
      #                    to the end, or to a single query term are
      #                    excluded.
      # records_to_words - Hash mapping each record to its word list.
      #
      # Returns [gram_collections, unique_grams].
      def create_grams(query, records_to_words)
        @query = query
        @query_terms = query.split.map(&:downcase).map(&:strip)
        # Join the rotated terms explicitly. Interpolating the Array slice
        # produced its inspect form (e.g. '["b", "c"] a'), which never
        # matched a gram.
        @query_alt = @query_terms.rotate.join(' ')

        invalid = Proc.new do |gram, *_args|
          # remove [[gram]] if == [[query]]
          gram == @query || gram == @query_alt || @query_terms.include?(gram)
        end

        gram_collections = records_to_words.map do |record, words|
          GramCollection.new(record, words, invalid)
        end
        all_grams = gram_collections.flat_map(&:grams)

        [gram_collections, make_grams_unique(all_grams)]
      end

      # Removes, in place, every gram ending in 's' whose content without
      # that trailing 's' is also present in the list.
      #
      # Returns the same (mutated) Array.
      def check_plurals(frequent_grams)
        # if exists [[gram]] and [[gram]]s then remove [[gram]]s
        frequent_grams_contents = frequent_grams.map(&:content)
        frequent_grams.delete_if do |gram|
          gram.content[-1] == 's' &&
            frequent_grams_contents.include?(gram.content[0...-1])
        end
      end

      # Merges grams with identical content. The first gram seen for each
      # content is kept and its frequency is increased (mutating that
      # object) by the frequency of every later duplicate.
      #
      # Returns the kept grams in first-seen order.
      def make_grams_unique(grams)
        grams.each_with_object({}) do |gram, by_content|
          if by_content[gram.content]
            by_content[gram.content].frequency += gram.frequency
          else
            by_content[gram.content] = gram
          end
        end.values
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.5
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Peter Lubell-Doughtie
|
@@ -22,31 +21,34 @@ files:
|
|
22
21
|
- lib/categorize.rb
|
23
22
|
- lib/categorize/model.rb
|
24
23
|
- lib/categorize/constants.rb
|
24
|
+
- lib/categorize/models/abstract_model.rb
|
25
25
|
- lib/categorize/models/bag_of_words.rb
|
26
26
|
- lib/categorize/models/cluster.rb
|
27
27
|
- lib/categorize/models/hierarchical_cluster.rb
|
28
|
+
- lib/categorize/utils/gram_collection.rb
|
29
|
+
- lib/categorize/utils/gram_node.rb
|
30
|
+
- lib/categorize/utils/grams.rb
|
28
31
|
homepage: http://www.helioid.com/
|
29
32
|
licenses: []
|
33
|
+
metadata: {}
|
30
34
|
post_install_message:
|
31
35
|
rdoc_options: []
|
32
36
|
require_paths:
|
33
37
|
- lib
|
34
38
|
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
-
none: false
|
36
39
|
requirements:
|
37
40
|
- - ! '>='
|
38
41
|
- !ruby/object:Gem::Version
|
39
42
|
version: '0'
|
40
43
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
44
|
requirements:
|
43
45
|
- - ! '>='
|
44
46
|
- !ruby/object:Gem::Version
|
45
47
|
version: '0'
|
46
48
|
requirements: []
|
47
49
|
rubyforge_project:
|
48
|
-
rubygems_version:
|
50
|
+
rubygems_version: 2.0.3
|
49
51
|
signing_key:
|
50
|
-
specification_version:
|
52
|
+
specification_version: 4
|
51
53
|
summary: A text categorization library.
|
52
54
|
test_files: []
|