categorize 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/lib/categorize/models/abstract_model.rb +36 -0
- data/lib/categorize/utils/gram_collection.rb +47 -0
- data/lib/categorize/utils/gram_node.rb +16 -0
- data/lib/categorize/utils/grams.rb +46 -0
- metadata +8 -6
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
OGNkMmQ5MzEwZGFlOWUxNWM0MzU0MTI0MTI2NzE5NTBlNGZjYzM3Ng==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YmNmMDE5NWMxYmZhNWI0ZDI2NDA3MjdkOTNjYmI2MGUzMWY0ZTVjZQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ZTdiM2IyMzRiOTg1Y2Y5MDc2ZWQwY2EyYjA3YTZjODEzYmM5MTU5NWVlNzBl
|
10
|
+
ZDdmYzhiNzdiOTYxOGY3YzgzNWFmZDhmMmIxODczZmY1NGM2MmM2NzI5NzVi
|
11
|
+
NWYzMGMwOGI2MWI5Mjk5NmY4MmMwM2YyZWFjNzU1MGMxMjcwYWI=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MjQ1NWQ4ZGVlMzNjZDZkNDVmODViOTY1ZTM4ZGZlYjhjMGVmNDQ4ZGRiNmRm
|
14
|
+
MGY1OTNhN2NkMzQ3Y2U4OGIyMDc3MTU2ZTc5MTE0ZGE4NTc4ODg2MGE5MjRm
|
15
|
+
N2M3MWQ4YzJhYzFjNTNjZTNjNDA3ZjVlM2RmZDVkMTcxNTFkNDM=
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
  module Models
    # Base class that turns per-record token lists into count vectors and
    # wraps them in an Ai4r data set.
    class AbstractModel
      require 'ai4r'

      # Starts with two empty hashes; nothing in this class reads them, so
      # they are presumably used by subclasses as caches.
      def initialize
        @gram_cache = {}
        @bigram_max_cache = {}
      end

      # Stores the token lists, labels and vectors on the instance and
      # returns the data set built from them.
      #
      # @param records_to_tokens [Hash] record => list of tokens
      # @return [Ai4r::Data::DataSet]
      def build_vars(records_to_tokens)
        @tokens = records_to_tokens.values
        @labels, @vectors = vectorize(@tokens)
        build_dataset(@labels, @vectors)
      end

      # Builds one count vector per token group.
      #
      # @param token_groups [Array<Array>] one token list per record
      # @return [Array] a pair: the unique tokens across all groups (the
      #   labels) and, for each group, how often each label occurs in it
      def vectorize(token_groups)
        labels = token_groups.flatten.uniq
        vectors = token_groups.map do |group|
          labels.map { |label| group.count(label) }
        end
        [labels, vectors]
      end

      # Wraps the vectors and their labels in an Ai4r data set.
      #
      # @param labels [Array] one label per vector column
      # @param vectors [Array<Array<Integer>>] one row per record
      # @return [Ai4r::Data::DataSet]
      def build_dataset(labels, vectors)
        Ai4r::Data::DataSet.new(data_items: vectors, data_labels: labels)
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
  module Utils
    # Holds the unigrams, bigrams and trigrams extracted from one piece of
    # content, together with how often each one occurs.
    class GramCollection
      attr_reader :grams, :content_to_frequency, :content
      attr_accessor :fitness

      # @param content [Object] the record these words belong to
      # @param words [Array<String>] the record's words, in order
      # @param invalid [#call] called with each candidate gram; a truthy
      #   result excludes that gram from the collection
      def initialize(content, words, invalid)
        @fitness = {}
        @content = content
        @invalid = invalid

        # TODO: n grammify this
        previous = nil
        before_previous = nil

        # Keyed by gram text while counting; flattened to a list below.
        @grams = {}
        @content_to_frequency = words.each_with_object({}) do |word, counts|
          bigram = nil
          trigram = nil
          # A bigram needs a preceding word that differs from this one.
          if previous && previous != word
            bigram = "#{previous} #{word}"
            # A trigram additionally needs the word two back to differ
            # from the current word.
            if before_previous && before_previous != word
              trigram = "#{before_previous} #{bigram}"
            end
          end

          [word, bigram, trigram].compact.each do |gram|
            next if @invalid.call(gram)

            if counts.key?(gram)
              counts[gram] += 1
              @grams[gram].frequency += 1
            else
              counts[gram] = 1
              @grams[gram] = GramNode.new(self, gram, 1)
            end
          end

          before_previous = previous
          previous = word
        end

        @grams = @grams.values
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
  module Utils
    # A single gram (its text and occurrence count) plus a reference back
    # to the collection that produced it.
    class GramNode
      attr_reader :content, :gram_collection
      attr_accessor :frequency

      # @param gram_collection [Object] the collection this gram belongs to
      # @param content [String] the gram text
      # @param frequency [Integer] initial occurrence count
      def initialize(gram_collection, content, frequency = 0)
        # Must use the same name as the attr_reader above; the previous
        # @gram_group assignment left #gram_collection always returning nil.
        @gram_collection = gram_collection
        @content = content
        @frequency = frequency
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
  module Utils
    # Mixin with helpers for building gram collections from records and
    # for merging and pruning the resulting grams.
    module Grams
      # Builds one GramCollection per record and a merged list of all grams.
      #
      # Grams equal to the query, to the query with its first term moved to
      # the end, or to any single query term are excluded. Sets @query,
      # @query_terms and @query_alt on the including object.
      #
      # @param query [String] the search query
      # @param records_to_words [Hash] record => list of words
      # @return [Array] a pair: the gram collections (one per record) and
      #   the grams of all collections merged by content
      def create_grams(query, records_to_words)
        @query = query
        @query_terms = query.split.map(&:downcase).map(&:strip)
        # Join the rotated terms explicitly: interpolating the Array slice
        # directly produced its inspect form (e.g. '["bar"] foo'), which
        # could never match a gram.
        @query_alt = @query_terms.rotate.join(' ')

        invalid = Proc.new do |gram, *_args|
          # remove [[gram]] if == [[query]]
          gram == @query || gram == @query_alt || @query_terms.include?(gram)
        end

        all_grams = []
        gram_collections = records_to_words.map do |record, words|
          gram_collection = GramCollection.new(record, words, invalid)
          all_grams.concat(gram_collection.grams)
          gram_collection
        end

        [gram_collections, make_grams_unique(all_grams)]
      end

      # Removes, in place, every gram ending in 's' whose content without
      # that final 's' is also present in the list.
      #
      # @param frequent_grams [Array] objects responding to #content
      # @return [Array] the same (mutated) array
      def check_plurals(frequent_grams)
        # if exists [[gram]] and [[gram]]s then remove [[gram]]s
        contents = frequent_grams.map(&:content)
        frequent_grams.delete_if do |gram|
          gram.content.end_with?('s') && contents.include?(gram.content[0...-1])
        end
      end

      # Merges grams that share the same content. The first gram seen for a
      # content is kept and its frequency is increased (mutated in place)
      # by the frequency of every later duplicate.
      #
      # @param grams [Array] objects responding to #content and #frequency
      # @return [Array] one gram per distinct content, in first-seen order
      def make_grams_unique(grams)
        grams.each_with_object({}) do |gram, by_content|
          existing = by_content[gram.content]
          if existing
            existing.frequency += gram.frequency
          else
            by_content[gram.content] = gram
          end
        end.values
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.5
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Peter Lubell-Doughtie
|
@@ -22,31 +21,34 @@ files:
|
|
22
21
|
- lib/categorize.rb
|
23
22
|
- lib/categorize/model.rb
|
24
23
|
- lib/categorize/constants.rb
|
24
|
+
- lib/categorize/models/abstract_model.rb
|
25
25
|
- lib/categorize/models/bag_of_words.rb
|
26
26
|
- lib/categorize/models/cluster.rb
|
27
27
|
- lib/categorize/models/hierarchical_cluster.rb
|
28
|
+
- lib/categorize/utils/gram_collection.rb
|
29
|
+
- lib/categorize/utils/gram_node.rb
|
30
|
+
- lib/categorize/utils/grams.rb
|
28
31
|
homepage: http://www.helioid.com/
|
29
32
|
licenses: []
|
33
|
+
metadata: {}
|
30
34
|
post_install_message:
|
31
35
|
rdoc_options: []
|
32
36
|
require_paths:
|
33
37
|
- lib
|
34
38
|
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
-
none: false
|
36
39
|
requirements:
|
37
40
|
- - ! '>='
|
38
41
|
- !ruby/object:Gem::Version
|
39
42
|
version: '0'
|
40
43
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
44
|
requirements:
|
43
45
|
- - ! '>='
|
44
46
|
- !ruby/object:Gem::Version
|
45
47
|
version: '0'
|
46
48
|
requirements: []
|
47
49
|
rubyforge_project:
|
48
|
-
rubygems_version:
|
50
|
+
rubygems_version: 2.0.3
|
49
51
|
signing_key:
|
50
|
-
specification_version:
|
52
|
+
specification_version: 4
|
51
53
|
summary: A text categorization library.
|
52
54
|
test_files: []
|