categorize 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ OGNkMmQ5MzEwZGFlOWUxNWM0MzU0MTI0MTI2NzE5NTBlNGZjYzM3Ng==
5
+ data.tar.gz: !binary |-
6
+ YmNmMDE5NWMxYmZhNWI0ZDI2NDA3MjdkOTNjYmI2MGUzMWY0ZTVjZQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ZTdiM2IyMzRiOTg1Y2Y5MDc2ZWQwY2EyYjA3YTZjODEzYmM5MTU5NWVlNzBl
10
+ ZDdmYzhiNzdiOTYxOGY3YzgzNWFmZDhmMmIxODczZmY1NGM2MmM2NzI5NzVi
11
+ NWYzMGMwOGI2MWI5Mjk5NmY4MmMwM2YyZWFjNzU1MGMxMjcwYWI=
12
+ data.tar.gz: !binary |-
13
+ MjQ1NWQ4ZGVlMzNjZDZkNDVmODViOTY1ZTM4ZGZlYjhjMGVmNDQ4ZGRiNmRm
14
+ MGY1OTNhN2NkMzQ3Y2U4OGIyMDc3MTU2ZTc5MTE0ZGE4NTc4ODg2MGE5MjRm
15
+ N2M3MWQ4YzJhYzFjNTNjZTNjNDA3ZjVlM2RmZDVkMTcxNTFkNDM=
@@ -0,0 +1,36 @@
1
+ # encoding: utf-8
2
+
3
module Categorize
  module Models
    # Shared base for categorization models. Turns per-record token lists
    # into term-count vectors and wraps them in an Ai4r data set.
    class AbstractModel
      require 'ai4r'

      # Sets up two empty caches. Neither is read in this class; presumably
      # they are used by subclasses -- confirm against the concrete models.
      def initialize
        @gram_cache = {}
        @bigram_max_cache = {}
      end

      # Builds the model's working state from a record => tokens mapping.
      #
      # Stores the token lists in @tokens and the vectorized form in
      # @labels / @vectors.
      #
      # @param records_to_tokens [Hash] maps each record to its token list
      # @return [Ai4r::Data::DataSet] data set built from the vectors
      def build_vars(records_to_tokens)
        @tokens = records_to_tokens.values
        @labels, @vectors = vectorize(@tokens)
        build_dataset(@labels, @vectors)
      end

      # Converts token lists into count vectors over a shared vocabulary.
      #
      # The vocabulary (labels) is every distinct token across all groups, in
      # first-seen order. Each vector holds, per label, how many times that
      # token occurs in the corresponding group.
      #
      # @param token_groups [Array<Array>] one token list per record
      # @return [Array(Array, Array<Array<Integer>>)] [labels, vectors]
      def vectorize(token_groups)
        labels = token_groups.flatten.uniq
        vectors = token_groups.map do |tokens|
          # Count each group once instead of rescanning it for every label.
          counts = tokens.each_with_object(Hash.new(0)) do |token, tally|
            tally[token] += 1
          end
          labels.map { |label| counts[label] }
        end
        [labels, vectors]
      end

      # Wraps labels and vectors in an Ai4r data set.
      #
      # @param labels [Array] vocabulary, one entry per vector column
      # @param vectors [Array<Array<Integer>>] one count vector per record
      # @return [Ai4r::Data::DataSet]
      def build_dataset(labels, vectors)
        Ai4r::Data::DataSet.new(data_items: vectors, data_labels: labels)
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+
3
module Categorize
  module Utils
    # The unigrams, bigrams and trigrams found in one piece of content,
    # together with how often each occurs.
    class GramCollection
      attr_reader :grams, :content_to_frequency, :content
      attr_accessor :fitness

      # Walks the words in order and counts every accepted n-gram.
      #
      # After construction, content_to_frequency maps each gram string to its
      # count and grams is the list of GramNode objects, one per distinct
      # gram, in first-seen order.
      #
      # @param content [Object] the record these words came from (stored as-is)
      # @param words [Array<String>] the record's words, in order
      # @param invalid [#call] called with a gram string; a truthy result
      #   means the gram is skipped
      def initialize(content, words, invalid)
        @fitness = {}
        @content = content
        @invalid = invalid

        # TODO: n grammify this
        frequencies = {}
        nodes = {}
        previous = nil
        before_previous = nil

        words.each do |word|
          candidate_grams(before_previous, previous, word).each do |gram|
            next if @invalid.call(gram)

            if frequencies[gram]
              frequencies[gram] += 1
              nodes[gram].frequency += 1
            else
              frequencies[gram] = 1
              nodes[gram] = GramNode.new(self, gram, 1)
            end
          end
          before_previous = previous
          previous = word
        end

        @content_to_frequency = frequencies
        @grams = nodes.values
      end

      private

      # Lists the grams that end at +word+: the word itself, the bigram with
      # the previous word, and the trigram with the two previous words.
      # A bigram is only formed when the previous word differs from +word+;
      # a trigram only when a bigram exists and the word two back also
      # differs from +word+.
      def candidate_grams(before_previous, previous, word)
        bigram = trigram = nil
        if previous && previous != word
          bigram = "#{previous} #{word}"
          if before_previous && before_previous != word
            trigram = "#{before_previous} #{bigram}"
          end
        end
        [word, bigram, trigram].compact
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # encoding: utf-8
2
+
3
module Categorize
  module Utils
    # A single n-gram: its text, how often it occurs, and the collection
    # it was found in.
    class GramNode
      attr_reader :content, :gram_collection
      attr_accessor :frequency

      # @param gram_collection [Object] the collection that owns this gram
      # @param content [String] the gram text
      # @param frequency [Integer] initial occurrence count
      def initialize(gram_collection, content, frequency = 0)
        # Must be @gram_collection so the attr_reader above returns it.
        @gram_collection = gram_collection
        @content = content
        @frequency = frequency
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # encoding: utf-8
2
+
3
module Categorize
  module Utils
    # Mixin that builds gram collections for a set of records and offers
    # helpers for de-duplicating and pruning the resulting grams.
    module Grams
      # Builds one GramCollection per record, skipping grams that merely
      # repeat the query.
      #
      # A gram is skipped when it equals the raw query, equals the query with
      # its first term moved to the end, or is one of the query's terms
      # (terms are lower-cased before comparison).
      #
      # Sets @query, @query_terms and @query_alt on the including object.
      #
      # @param query [String] the search query
      # @param records_to_words [Hash] maps each record to its word list
      # @return [Array(Array<GramCollection>, Array)] the collections, and
      #   the grams of all collections merged by content
      def create_grams(query, records_to_words)
        @query = query
        @query_terms = query.split.map { |term| term.downcase.strip }
        # Join explicitly: interpolating the Array itself yields its inspect
        # form on Ruby 1.9+, which never matches a gram.
        @query_alt = @query_terms.rotate.join(' ')

        invalid = lambda do |gram, *_rest|
          # remove [[gram]] if == [[query]]
          gram == @query || gram == @query_alt || @query_terms.include?(gram)
        end

        gram_collections = records_to_words.map do |record, words|
          GramCollection.new(record, words, invalid)
        end
        all_grams = gram_collections.flat_map(&:grams)

        [gram_collections, make_grams_unique(all_grams)]
      end

      # Removes plural grams whose singular form is also present.
      # if exists [[gram]] and [[gram]]s then remove [[gram]]s
      #
      # Mutates and returns the given array.
      #
      # @param frequent_grams [Array] objects responding to #content
      # @return [Array] the same array, with plurals removed
      def check_plurals(frequent_grams)
        contents = frequent_grams.map(&:content)
        frequent_grams.delete_if do |gram|
          gram.content.end_with?('s') &&
            contents.include?(gram.content[0...-1])
        end
      end

      # Merges grams with the same content into one.
      #
      # The first gram seen for each content is kept and its frequency is
      # increased in place by the frequency of every later duplicate.
      #
      # @param grams [Array] objects responding to #content and #frequency
      # @return [Array] one gram per distinct content, in first-seen order
      def make_grams_unique(grams)
        grams.each_with_object({}) do |gram, by_content|
          if (kept = by_content[gram.content])
            kept.frequency += gram.frequency
          else
            by_content[gram.content] = gram
          end
        end.values
      end
    end
  end
end
metadata CHANGED
@@ -1,8 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: categorize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
5
- prerelease:
4
+ version: 0.0.5
6
5
  platform: ruby
7
6
  authors:
8
7
  - Peter Lubell-Doughtie
@@ -22,31 +21,34 @@ files:
22
21
  - lib/categorize.rb
23
22
  - lib/categorize/model.rb
24
23
  - lib/categorize/constants.rb
24
+ - lib/categorize/models/abstract_model.rb
25
25
  - lib/categorize/models/bag_of_words.rb
26
26
  - lib/categorize/models/cluster.rb
27
27
  - lib/categorize/models/hierarchical_cluster.rb
28
+ - lib/categorize/utils/gram_collection.rb
29
+ - lib/categorize/utils/gram_node.rb
30
+ - lib/categorize/utils/grams.rb
28
31
  homepage: http://www.helioid.com/
29
32
  licenses: []
33
+ metadata: {}
30
34
  post_install_message:
31
35
  rdoc_options: []
32
36
  require_paths:
33
37
  - lib
34
38
  required_ruby_version: !ruby/object:Gem::Requirement
35
- none: false
36
39
  requirements:
37
40
  - - ! '>='
38
41
  - !ruby/object:Gem::Version
39
42
  version: '0'
40
43
  required_rubygems_version: !ruby/object:Gem::Requirement
41
- none: false
42
44
  requirements:
43
45
  - - ! '>='
44
46
  - !ruby/object:Gem::Version
45
47
  version: '0'
46
48
  requirements: []
47
49
  rubyforge_project:
48
- rubygems_version: 1.8.24
50
+ rubygems_version: 2.0.3
49
51
  signing_key:
50
- specification_version: 3
52
+ specification_version: 4
51
53
  summary: A text categorization library.
52
54
  test_files: []