categorize 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ OGNkMmQ5MzEwZGFlOWUxNWM0MzU0MTI0MTI2NzE5NTBlNGZjYzM3Ng==
5
+ data.tar.gz: !binary |-
6
+ YmNmMDE5NWMxYmZhNWI0ZDI2NDA3MjdkOTNjYmI2MGUzMWY0ZTVjZQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ZTdiM2IyMzRiOTg1Y2Y5MDc2ZWQwY2EyYjA3YTZjODEzYmM5MTU5NWVlNzBl
10
+ ZDdmYzhiNzdiOTYxOGY3YzgzNWFmZDhmMmIxODczZmY1NGM2MmM2NzI5NzVi
11
+ NWYzMGMwOGI2MWI5Mjk5NmY4MmMwM2YyZWFjNzU1MGMxMjcwYWI=
12
+ data.tar.gz: !binary |-
13
+ MjQ1NWQ4ZGVlMzNjZDZkNDVmODViOTY1ZTM4ZGZlYjhjMGVmNDQ4ZGRiNmRm
14
+ MGY1OTNhN2NkMzQ3Y2U4OGIyMDc3MTU2ZTc5MTE0ZGE4NTc4ODg2MGE5MjRm
15
+ N2M3MWQ4YzJhYzFjNTNjZTNjNDA3ZjVlM2RmZDVkMTcxNTFkNDM=
@@ -0,0 +1,36 @@
1
+ # encoding: utf-8
2
+
3
+ module Categorize
4
+ module Models
5
+ class AbstractModel
6
+ require 'ai4r'
7
+
8
# Sets up two empty per-instance caches: @gram_cache and
# @bigram_max_cache. Both are plain hashes, so a missing key reads as nil.
def initialize
  @gram_cache, @bigram_max_cache = {}, {}
end
12
+
13
# Stores the token groups, their labels and their count vectors on the
# instance (@tokens, @labels, @vectors) and returns whatever
# build_dataset produces for those labels and vectors.
#
# records_to_tokens - a Hash; only its values (the token groups) are used.
def build_vars(records_to_tokens)
  token_groups = records_to_tokens.values
  @tokens = token_groups

  labels, vectors = vectorize(token_groups)
  @labels = labels
  @vectors = vectors

  build_dataset(labels, vectors)
end
18
+
19
# Turns groups of tokens into count vectors over a shared vocabulary.
#
# token_groups - an Array of token Arrays (one per record).
#
# Returns a two-element Array [labels, vectors]:
#   labels  - every distinct token, in order of first appearance.
#   vectors - one Array per group; vectors[g][i] is how many times
#             labels[i] occurs in token_groups[g].
def vectorize(token_groups)
  labels = token_groups.flatten.uniq

  vectors = token_groups.map do |tokens|
    # Tally each group once instead of rescanning it for every label;
    # labels absent from the group read back as 0 via the hash default.
    counts = Hash.new(0)
    tokens.each { |token| counts[token] += 1 }
    labels.map { |label| counts[label] }
  end

  [labels, vectors]
end
30
+
31
# Wraps the vectors and their labels in an Ai4r::Data::DataSet.
#
# labels  - passed to the data set as :data_labels.
# vectors - passed to the data set as :data_items.
def build_dataset(labels, vectors)
  dataset_options = { data_items: vectors, data_labels: labels }
  Ai4r::Data::DataSet.new(dataset_options)
end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+
3
+ module Categorize
4
+ module Utils
5
# The unigrams, bigrams and trigrams found in one record's word list.
#
# Readers:
#   content              - the record passed to the constructor.
#   content_to_frequency - Hash mapping each accepted gram string to its count.
#   grams                - Array of GramNode, one per accepted gram, in
#                          order of first appearance.
#   fitness              - read/write Hash, initially empty.
class GramCollection
  attr_reader :grams, :content_to_frequency, :content
  attr_accessor :fitness

  # content - the record these words belong to.
  # words   - the ordered words of the record.
  # invalid - a callable; grams for which it returns a truthy value are
  #           skipped.
  def initialize(content, words, invalid)
    @fitness = {}
    @content = content
    @invalid = invalid

    # TODO: generalize to arbitrary n-grams.
    counts = {}
    nodes = {}
    previous = nil
    before_previous = nil

    words.each do |word|
      candidates = [word]

      # A bigram needs a preceding word that differs from this one; a
      # trigram additionally needs the word two back to differ from it.
      if previous && previous != word
        pair = "#{previous} #{word}"
        candidates << pair
        if before_previous && before_previous != word
          candidates << "#{before_previous} #{pair}"
        end
      end

      candidates.compact.each do |gram|
        next if @invalid.call(gram)

        if counts[gram]
          counts[gram] += 1
          nodes[gram].frequency += 1
        else
          counts[gram] = 1
          nodes[gram] = GramNode.new(self, gram, 1)
        end
      end

      before_previous = previous
      previous = word
    end

    @content_to_frequency = counts
    @grams = nodes.values
  end
end
46
+ end
47
+ end
@@ -0,0 +1,16 @@
1
+ # encoding: utf-8
2
+
3
+ module Categorize
4
+ module Utils
5
# One gram (its text and how often it occurs) together with the
# collection it was found in.
#
# Readers:
#   content         - the gram text.
#   gram_collection - the collection object passed to the constructor.
#   frequency       - read/write occurrence count.
class GramNode
  attr_reader :content, :gram_collection
  attr_accessor :frequency

  # gram_collection - the owning collection.
  # content         - the gram text.
  # frequency       - initial count (defaults to 0).
  def initialize(gram_collection, content, frequency = 0)
    # Stored under the name the attr_reader exposes; it was previously
    # assigned to @gram_group, which left #gram_collection returning nil.
    @gram_collection = gram_collection
    @content = content
    @frequency = frequency
  end
end
15
+ end
16
+ end
@@ -0,0 +1,46 @@
1
+ # encoding: utf-8
2
+
3
+ module Categorize
4
+ module Utils
5
# Mixin for building per-record gram collections and merging their grams.
module Grams
  # Builds a GramCollection for every record and a merged gram list.
  #
  # Grams are filtered out when they equal the query as given, equal the
  # lower-cased query terms rotated left by one ("a b c" -> "b c a"), or
  # are themselves one of the lower-cased query terms.
  #
  # Sets @query, @query_terms and @query_alt on the including object.
  #
  # query            - the query String.
  # records_to_words - Hash-like enumerable yielding [record, words] pairs.
  #
  # Returns a two-element Array: the collections (one per record) and the
  # grams of all collections merged by content (see make_grams_unique).
  def create_grams(query, records_to_words)
    @query = query
    # String#split with no argument already discards surrounding
    # whitespace, so the terms need no further stripping.
    @query_terms = query.split.map(&:downcase)
    # Join explicitly: interpolating the Array itself yields its inspect
    # form on Ruby >= 1.9, which can never equal a gram.
    @query_alt = @query_terms.rotate(1).join(' ')

    invalid = proc do |gram, *_ignored|
      gram == @query || gram == @query_alt || @query_terms.include?(gram)
    end

    gram_collections = records_to_words.map do |record, words|
      GramCollection.new(record, words, invalid)
    end

    [gram_collections, make_grams_unique(gram_collections.flat_map(&:grams))]
  end

  # Removes, in place, every gram that ends in "s" when the same text
  # without that trailing "s" is also present.
  #
  # frequent_grams - Array of objects responding to #content.
  #
  # Returns the same (mutated) Array.
  def check_plurals(frequent_grams)
    # Snapshot the contents first so deletions do not affect the lookup.
    known_contents = frequent_grams.map(&:content)
    frequent_grams.delete_if do |gram|
      gram.content.end_with?('s') &&
        known_contents.include?(gram.content[0...-1])
    end
  end

  # Merges grams that share the same content.
  #
  # The first gram seen for a given content is kept and the frequencies of
  # later duplicates are added onto it, so that kept gram is mutated.
  #
  # grams - Array of objects responding to #content and #frequency.
  #
  # Returns an Array of the kept grams, in order of first appearance.
  def make_grams_unique(grams)
    grams.each_with_object({}) do |gram, by_content|
      existing = by_content[gram.content]
      if existing
        existing.frequency += gram.frequency
      else
        by_content[gram.content] = gram
      end
    end.values
  end
end
45
+ end
46
+ end
metadata CHANGED
@@ -1,8 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: categorize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
5
- prerelease:
4
+ version: 0.0.5
6
5
  platform: ruby
7
6
  authors:
8
7
  - Peter Lubell-Doughtie
@@ -22,31 +21,34 @@ files:
22
21
  - lib/categorize.rb
23
22
  - lib/categorize/model.rb
24
23
  - lib/categorize/constants.rb
24
+ - lib/categorize/models/abstract_model.rb
25
25
  - lib/categorize/models/bag_of_words.rb
26
26
  - lib/categorize/models/cluster.rb
27
27
  - lib/categorize/models/hierarchical_cluster.rb
28
+ - lib/categorize/utils/gram_collection.rb
29
+ - lib/categorize/utils/gram_node.rb
30
+ - lib/categorize/utils/grams.rb
28
31
  homepage: http://www.helioid.com/
29
32
  licenses: []
33
+ metadata: {}
30
34
  post_install_message:
31
35
  rdoc_options: []
32
36
  require_paths:
33
37
  - lib
34
38
  required_ruby_version: !ruby/object:Gem::Requirement
35
- none: false
36
39
  requirements:
37
40
  - - ! '>='
38
41
  - !ruby/object:Gem::Version
39
42
  version: '0'
40
43
  required_rubygems_version: !ruby/object:Gem::Requirement
41
- none: false
42
44
  requirements:
43
45
  - - ! '>='
44
46
  - !ruby/object:Gem::Version
45
47
  version: '0'
46
48
  requirements: []
47
49
  rubyforge_project:
48
- rubygems_version: 1.8.24
50
+ rubygems_version: 2.0.3
49
51
  signing_key:
50
- specification_version: 3
52
+ specification_version: 4
51
53
  summary: A text categorization library.
52
54
  test_files: []