known_item_search_classifier 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9723eb411c5aac044ace45f88b1529d56b5cbfb2
|
4
|
+
data.tar.gz: 34757a78f9268eb5b88d1fdb0bea1a9c1569e32d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 961dc70c84bfcb5c2ddfaa866c489c149d84e09ce0bdbb72dd170c7ebdbf0c800cff8dd22d45ab486a84c48a1cfdde91104f5d1cff7828875885d805db058848
|
7
|
+
data.tar.gz: 7b4ca3cd5cb83a9380da7509a2758eb3a6b7e1f414ee44e2a58f809ebfab68cace4ce8d6c3c94b2c0bb6676285633d91f9f9517a489168b2847e854d3b4e2160
|
@@ -1,8 +1,10 @@
|
|
1
1
|
# Classifies search strings as either known-item searches or unknown-item searches
|
2
|
+
require 'csv'
|
2
3
|
require 'gaussian_naive_bayes'
|
3
4
|
|
4
5
|
module KnownItemSearchClassifier
|
5
6
|
class Classifier
|
7
|
+
attr_accessor :custom_training_set
|
6
8
|
def initialize
|
7
9
|
set = DefaultTrainingSet.new
|
8
10
|
@default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries, set.categories_probabilities
|
@@ -11,7 +13,7 @@ module KnownItemSearchClassifier
|
|
11
13
|
return classify query_string
|
12
14
|
end
|
13
15
|
def train training_set
|
14
|
-
|
16
|
+
unless defined? @custom_training_set
|
15
17
|
@custom_training_set = GaussianNaiveBayes::Learner.new
|
16
18
|
end
|
17
19
|
training_set.each do |query|
|
@@ -19,10 +21,10 @@ module KnownItemSearchClassifier
|
|
19
21
|
end
|
20
22
|
end
|
21
23
|
def train_from_csv filename
|
22
|
-
|
24
|
+
unless defined? @custom_training_set
|
23
25
|
@custom_training_set = GaussianNaiveBayes::Learner.new
|
24
26
|
end
|
25
|
-
csv = CSV.read(filename)
|
27
|
+
csv = ::CSV.read(filename)
|
26
28
|
csv.each do |line|
|
27
29
|
submit_vector line
|
28
30
|
end
|
@@ -2,20 +2,22 @@ module KnownItemSearchClassifier
|
|
2
2
|
class DefaultTrainingSet
|
3
3
|
attr_reader :categories_probabilities, :categories_summaries
|
4
4
|
def initialize
|
5
|
-
@categories_probabilities={:unknown=>0.
|
5
|
+
@categories_probabilities={:unknown=>0.835, :known=>0.165}
|
6
6
|
@categories_summaries= {
|
7
7
|
:unknown=>{
|
8
|
-
0=>{:mean=>0.
|
9
|
-
1=>{:mean=>0.
|
10
|
-
2=>{:mean=>0.
|
11
|
-
3=>{:mean=>0.
|
12
|
-
4=>{:mean=>2.
|
8
|
+
0=>{:mean=>0.32335329341317365, :standard_deviation=>0.4691630728112455},
|
9
|
+
1=>{:mean=>0.01867693185058454, :standard_deviation=>0.0856521002382124},
|
10
|
+
2=>{:mean=>0.0024950099800399197, :standard_deviation=>0.02318575984424029},
|
11
|
+
3=>{:mean=>0.18252067293983462, :standard_deviation=>0.32649287803592736},
|
12
|
+
4=>{:mean=>2.2634730538922154, :standard_deviation=>1.3497147972472143},
|
13
|
+
5=>{:mean=>0.20958083832335328, :standard_deviation=>1.2933208182456999}},
|
13
14
|
:known=>{
|
14
|
-
0=>{:mean=>0.
|
15
|
-
1=>{:mean=>0.
|
16
|
-
2=>{:mean=>0.
|
17
|
-
3=>{:mean=>0.
|
18
|
-
4=>{:mean=>
|
15
|
+
0=>{:mean=>0.3333333333333333, :standard_deviation=>0.478713553878169},
|
16
|
+
1=>{:mean=>0.034283854046699896, :standard_deviation=>0.07844034834013752},
|
17
|
+
2=>{:mean=>0.06397250092902267, :standard_deviation=>0.10673099909054994},
|
18
|
+
3=>{:mean=>0.06715805055726004, :standard_deviation=>0.1488979015655406},
|
19
|
+
4=>{:mean=>4.696969696969697, :standard_deviation=>4.9591131294116515},
|
20
|
+
5=>{:mean=>3.9393939393939394, :standard_deviation=>5.606577576491037}}}
|
19
21
|
end
|
20
22
|
end
|
21
23
|
end
|
@@ -12,13 +12,14 @@ module KnownItemSearchClassifier
|
|
12
12
|
@punctuation_ratio = punctuation_ratio
|
13
13
|
@determiner_ratio = determiner_ratio
|
14
14
|
@proper_noun_ratio = proper_noun_ratio
|
15
|
+
@numeric_count = numeric_count
|
15
16
|
|
16
17
|
#@num_keywords = count_keywords
|
17
18
|
#@refers_to_an_item_that_is_known = check_against_known_titles
|
18
19
|
|
19
20
|
end
|
20
21
|
def feature_array
|
21
|
-
return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words]
|
22
|
+
return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
|
22
23
|
end
|
23
24
|
private
|
24
25
|
def is_mixed_case?
|
@@ -35,6 +36,9 @@ module KnownItemSearchClassifier
|
|
35
36
|
num_det = @tagged.scan(/\/DET/).size.to_f
|
36
37
|
return num_det / @num_words
|
37
38
|
end
|
39
|
+
def numeric_count
|
40
|
+
return @string.scan(/[0-9]/).length
|
41
|
+
end
|
38
42
|
def proper_noun_ratio
|
39
43
|
num_prop_noun = @tagged.scan(/\/NNP/).size.to_f
|
40
44
|
return num_prop_noun / @num_words
|
@@ -55,6 +59,7 @@ module KnownItemSearchClassifier
|
|
55
59
|
end
|
56
60
|
def check_against_known_titles
|
57
61
|
known_titles = [
|
62
|
+
'fountainhead',
|
58
63
|
'salt sugar fat',
|
59
64
|
]
|
60
65
|
if known_titles.include? @query_string.downcase
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: known_item_search_classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jane Sandberg
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-03-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: engtagger
|