known_item_search_classifier 0.1.0 → 0.2.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: df03a8dc0661439a41c7d3366a49573f463a176f
4
- data.tar.gz: 58713a770bba4a173adc4e88bb507f256b4eafc5
3
+ metadata.gz: 9723eb411c5aac044ace45f88b1529d56b5cbfb2
4
+ data.tar.gz: 34757a78f9268eb5b88d1fdb0bea1a9c1569e32d
5
5
  SHA512:
6
- metadata.gz: 470565b3b2932df41a0d02b99048746b5f5e97de476b273a9648d473d63d7e73e44d038ff2b35ffd1f0bbd413960d35df5ca1ad8e8b8902038aedb57d64225f8
7
- data.tar.gz: 7e81f8262925a653ab33c12844947f9df40b5510cf3e43d801d6a31cab7f3bf3a3de58dc9693ecbd71283ff309dc9bdb6490d1a6be25fe436726ffc09fea28e0
6
+ metadata.gz: 961dc70c84bfcb5c2ddfaa866c489c149d84e09ce0bdbb72dd170c7ebdbf0c800cff8dd22d45ab486a84c48a1cfdde91104f5d1cff7828875885d805db058848
7
+ data.tar.gz: 7b4ca3cd5cb83a9380da7509a2758eb3a6b7e1f414ee44e2a58f809ebfab68cace4ce8d6c3c94b2c0bb6676285633d91f9f9517a489168b2847e854d3b4e2160
@@ -1,8 +1,10 @@
1
1
  # Classifies search strings as either known-item searches or unknown-item searches
2
+ require 'csv'
2
3
  require 'gaussian_naive_bayes'
3
4
 
4
5
  module KnownItemSearchClassifier
5
6
  class Classifier
7
+ attr_accessor :custom_training_set
6
8
  def initialize
7
9
  set = DefaultTrainingSet.new
8
10
  @default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries, set.categories_probabilities
@@ -11,7 +13,7 @@ module KnownItemSearchClassifier
11
13
  return classify query_string
12
14
  end
13
15
  def train training_set
14
- if defined? @custom_training_set
16
+ unless defined? @custom_training_set
15
17
  @custom_training_set = GaussianNaiveBayes::Learner.new
16
18
  end
17
19
  training_set.each do |query|
@@ -19,10 +21,10 @@ module KnownItemSearchClassifier
19
21
  end
20
22
  end
21
23
  def train_from_csv filename
22
- if defined? @custom_training_set
24
+ unless defined? @custom_training_set
23
25
  @custom_training_set = GaussianNaiveBayes::Learner.new
24
26
  end
25
- csv = CSV.read(filename)
27
+ csv = ::CSV.read(filename)
26
28
  csv.each do |line|
27
29
  submit_vector line
28
30
  end
@@ -2,20 +2,22 @@ module KnownItemSearchClassifier
2
2
  class DefaultTrainingSet
3
3
  attr_reader :categories_probabilities, :categories_summaries
4
4
  def initialize
5
- @categories_probabilities={:unknown=>0.78, :known=>0.22}
5
+ @categories_probabilities={:unknown=>0.835, :known=>0.165}
6
6
  @categories_summaries= {
7
7
  :unknown=>{
8
- 0=>{:mean=>0.2564102564102564, :standard_deviation=>0.4394771815921655},
9
- 1=>{:mean=>0.03418803418803419, :standard_deviation=>0.11344969312798027},
10
- 2=>{:mean=>0.002564102564102564, :standard_deviation=>0.0226455406828919},
11
- 3=>{:mean=>0.12991452991452992, :standard_deviation=>0.26648206508636013},
12
- 4=>{:mean=>2.7948717948717947, :standard_deviation=>2.053561836691609}},
8
+ 0=>{:mean=>0.32335329341317365, :standard_deviation=>0.4691630728112455},
9
+ 1=>{:mean=>0.01867693185058454, :standard_deviation=>0.0856521002382124},
10
+ 2=>{:mean=>0.0024950099800399197, :standard_deviation=>0.02318575984424029},
11
+ 3=>{:mean=>0.18252067293983462, :standard_deviation=>0.32649287803592736},
12
+ 4=>{:mean=>2.2634730538922154, :standard_deviation=>1.3497147972472143},
13
+ 5=>{:mean=>0.20958083832335328, :standard_deviation=>1.2933208182456999}},
13
14
  :known=>{
14
- 0=>{:mean=>0.5454545454545454, :standard_deviation=>0.5096471914376255},
15
- 1=>{:mean=>0.051659451659451655, :standard_deviation=>0.07957404805575267},
16
- 2=>{:mean=>0.021248196248196245, :standard_deviation=>0.04412470821426937},
17
- 3=>{:mean=>0.22550505050505054, :standard_deviation=>0.2520704609787127},
18
- 4=>{:mean=>7.590909090909091, :standard_deviation=>5.770690236086651}}}
15
+ 0=>{:mean=>0.3333333333333333, :standard_deviation=>0.478713553878169},
16
+ 1=>{:mean=>0.034283854046699896, :standard_deviation=>0.07844034834013752},
17
+ 2=>{:mean=>0.06397250092902267, :standard_deviation=>0.10673099909054994},
18
+ 3=>{:mean=>0.06715805055726004, :standard_deviation=>0.1488979015655406},
19
+ 4=>{:mean=>4.696969696969697, :standard_deviation=>4.9591131294116515},
20
+ 5=>{:mean=>3.9393939393939394, :standard_deviation=>5.606577576491037}}}
19
21
  end
20
22
  end
21
23
  end
@@ -12,13 +12,14 @@ module KnownItemSearchClassifier
12
12
  @punctuation_ratio = punctuation_ratio
13
13
  @determiner_ratio = determiner_ratio
14
14
  @proper_noun_ratio = proper_noun_ratio
15
+ @numeric_count = numeric_count
15
16
 
16
17
  #@num_keywords = count_keywords
17
18
  #@refers_to_an_item_that_is_known = check_against_known_titles
18
19
 
19
20
  end
20
21
  def feature_array
21
- return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words]
22
+ return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
22
23
  end
23
24
  private
24
25
  def is_mixed_case?
@@ -35,6 +36,9 @@ module KnownItemSearchClassifier
35
36
  num_det = @tagged.scan(/\/DET/).size.to_f
36
37
  return num_det / @num_words
37
38
  end
39
+ def numeric_count
40
+ return @string.scan(/[0-9]/).length
41
+ end
38
42
  def proper_noun_ratio
39
43
  num_prop_noun = @tagged.scan(/\/NNP/).size.to_f
40
44
  return num_prop_noun / @num_words
@@ -55,6 +59,7 @@ module KnownItemSearchClassifier
55
59
  end
56
60
  def check_against_known_titles
57
61
  known_titles = [
62
+ 'fountainhead',
58
63
  'salt sugar fat',
59
64
  ]
60
65
  if known_titles.include? @query_string.downcase
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: known_item_search_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jane Sandberg
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-13 00:00:00.000000000 Z
11
+ date: 2017-03-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: engtagger