known_item_search_classifier 0.1.0 → 0.2.0

Sign up to get free protection for your applications and access to all features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: df03a8dc0661439a41c7d3366a49573f463a176f
4
- data.tar.gz: 58713a770bba4a173adc4e88bb507f256b4eafc5
3
+ metadata.gz: 9723eb411c5aac044ace45f88b1529d56b5cbfb2
4
+ data.tar.gz: 34757a78f9268eb5b88d1fdb0bea1a9c1569e32d
5
5
  SHA512:
6
- metadata.gz: 470565b3b2932df41a0d02b99048746b5f5e97de476b273a9648d473d63d7e73e44d038ff2b35ffd1f0bbd413960d35df5ca1ad8e8b8902038aedb57d64225f8
7
- data.tar.gz: 7e81f8262925a653ab33c12844947f9df40b5510cf3e43d801d6a31cab7f3bf3a3de58dc9693ecbd71283ff309dc9bdb6490d1a6be25fe436726ffc09fea28e0
6
+ metadata.gz: 961dc70c84bfcb5c2ddfaa866c489c149d84e09ce0bdbb72dd170c7ebdbf0c800cff8dd22d45ab486a84c48a1cfdde91104f5d1cff7828875885d805db058848
7
+ data.tar.gz: 7b4ca3cd5cb83a9380da7509a2758eb3a6b7e1f414ee44e2a58f809ebfab68cace4ce8d6c3c94b2c0bb6676285633d91f9f9517a489168b2847e854d3b4e2160
@@ -1,8 +1,10 @@
1
1
  # Classifies search strings as either known-item searches or unknown-item searches
2
+ require 'csv'
2
3
  require 'gaussian_naive_bayes'
3
4
 
4
5
  module KnownItemSearchClassifier
5
6
  class Classifier
7
+ attr_accessor :custom_training_set
6
8
  def initialize
7
9
  set = DefaultTrainingSet.new
8
10
  @default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries, set.categories_probabilities
@@ -11,7 +13,7 @@ module KnownItemSearchClassifier
11
13
  return classify query_string
12
14
  end
13
15
  def train training_set
14
- if defined? @custom_training_set
16
+ unless defined? @custom_training_set
15
17
  @custom_training_set = GaussianNaiveBayes::Learner.new
16
18
  end
17
19
  training_set.each do |query|
@@ -19,10 +21,10 @@ module KnownItemSearchClassifier
19
21
  end
20
22
  end
21
23
  def train_from_csv filename
22
- if defined? @custom_training_set
24
+ unless defined? @custom_training_set
23
25
  @custom_training_set = GaussianNaiveBayes::Learner.new
24
26
  end
25
- csv = CSV.read(filename)
27
+ csv = ::CSV.read(filename)
26
28
  csv.each do |line|
27
29
  submit_vector line
28
30
  end
@@ -2,20 +2,22 @@ module KnownItemSearchClassifier
2
2
  class DefaultTrainingSet
3
3
  attr_reader :categories_probabilities, :categories_summaries
4
4
  def initialize
5
- @categories_probabilities={:unknown=>0.78, :known=>0.22}
5
+ @categories_probabilities={:unknown=>0.835, :known=>0.165}
6
6
  @categories_summaries= {
7
7
  :unknown=>{
8
- 0=>{:mean=>0.2564102564102564, :standard_deviation=>0.4394771815921655},
9
- 1=>{:mean=>0.03418803418803419, :standard_deviation=>0.11344969312798027},
10
- 2=>{:mean=>0.002564102564102564, :standard_deviation=>0.0226455406828919},
11
- 3=>{:mean=>0.12991452991452992, :standard_deviation=>0.26648206508636013},
12
- 4=>{:mean=>2.7948717948717947, :standard_deviation=>2.053561836691609}},
8
+ 0=>{:mean=>0.32335329341317365, :standard_deviation=>0.4691630728112455},
9
+ 1=>{:mean=>0.01867693185058454, :standard_deviation=>0.0856521002382124},
10
+ 2=>{:mean=>0.0024950099800399197, :standard_deviation=>0.02318575984424029},
11
+ 3=>{:mean=>0.18252067293983462, :standard_deviation=>0.32649287803592736},
12
+ 4=>{:mean=>2.2634730538922154, :standard_deviation=>1.3497147972472143},
13
+ 5=>{:mean=>0.20958083832335328, :standard_deviation=>1.2933208182456999}},
13
14
  :known=>{
14
- 0=>{:mean=>0.5454545454545454, :standard_deviation=>0.5096471914376255},
15
- 1=>{:mean=>0.051659451659451655, :standard_deviation=>0.07957404805575267},
16
- 2=>{:mean=>0.021248196248196245, :standard_deviation=>0.04412470821426937},
17
- 3=>{:mean=>0.22550505050505054, :standard_deviation=>0.2520704609787127},
18
- 4=>{:mean=>7.590909090909091, :standard_deviation=>5.770690236086651}}}
15
+ 0=>{:mean=>0.3333333333333333, :standard_deviation=>0.478713553878169},
16
+ 1=>{:mean=>0.034283854046699896, :standard_deviation=>0.07844034834013752},
17
+ 2=>{:mean=>0.06397250092902267, :standard_deviation=>0.10673099909054994},
18
+ 3=>{:mean=>0.06715805055726004, :standard_deviation=>0.1488979015655406},
19
+ 4=>{:mean=>4.696969696969697, :standard_deviation=>4.9591131294116515},
20
+ 5=>{:mean=>3.9393939393939394, :standard_deviation=>5.606577576491037}}}
19
21
  end
20
22
  end
21
23
  end
@@ -12,13 +12,14 @@ module KnownItemSearchClassifier
12
12
  @punctuation_ratio = punctuation_ratio
13
13
  @determiner_ratio = determiner_ratio
14
14
  @proper_noun_ratio = proper_noun_ratio
15
+ @numeric_count = numeric_count
15
16
 
16
17
  #@num_keywords = count_keywords
17
18
  #@refers_to_an_item_that_is_known = check_against_known_titles
18
19
 
19
20
  end
20
21
  def feature_array
21
- return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words]
22
+ return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
22
23
  end
23
24
  private
24
25
  def is_mixed_case?
@@ -35,6 +36,9 @@ module KnownItemSearchClassifier
35
36
  num_det = @tagged.scan(/\/DET/).size.to_f
36
37
  return num_det / @num_words
37
38
  end
39
+ def numeric_count
40
+ return @string.scan(/[0-9]/).length
41
+ end
38
42
  def proper_noun_ratio
39
43
  num_prop_noun = @tagged.scan(/\/NNP/).size.to_f
40
44
  return num_prop_noun / @num_words
@@ -55,6 +59,7 @@ module KnownItemSearchClassifier
55
59
  end
56
60
  def check_against_known_titles
57
61
  known_titles = [
62
+ 'fountainhead',
58
63
  'salt sugar fat',
59
64
  ]
60
65
  if known_titles.include? @query_string.downcase
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: known_item_search_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jane Sandberg
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-13 00:00:00.000000000 Z
11
+ date: 2017-03-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: engtagger