known_item_search_classifier 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9723eb411c5aac044ace45f88b1529d56b5cbfb2
|
4
|
+
data.tar.gz: 34757a78f9268eb5b88d1fdb0bea1a9c1569e32d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 961dc70c84bfcb5c2ddfaa866c489c149d84e09ce0bdbb72dd170c7ebdbf0c800cff8dd22d45ab486a84c48a1cfdde91104f5d1cff7828875885d805db058848
|
7
|
+
data.tar.gz: 7b4ca3cd5cb83a9380da7509a2758eb3a6b7e1f414ee44e2a58f809ebfab68cace4ce8d6c3c94b2c0bb6676285633d91f9f9517a489168b2847e854d3b4e2160
|
@@ -1,8 +1,10 @@
|
|
1
1
|
# Classifies search strings as either known-item searches or unknown-item searches
|
2
|
+
require 'csv'
|
2
3
|
require 'gaussian_naive_bayes'
|
3
4
|
|
4
5
|
module KnownItemSearchClassifier
|
5
6
|
class Classifier
|
7
|
+
attr_accessor :custom_training_set
|
6
8
|
def initialize
|
7
9
|
set = DefaultTrainingSet.new
|
8
10
|
@default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries, set.categories_probabilities
|
@@ -11,7 +13,7 @@ module KnownItemSearchClassifier
|
|
11
13
|
return classify query_string
|
12
14
|
end
|
13
15
|
def train training_set
|
14
|
-
|
16
|
+
unless defined? @custom_training_set
|
15
17
|
@custom_training_set = GaussianNaiveBayes::Learner.new
|
16
18
|
end
|
17
19
|
training_set.each do |query|
|
@@ -19,10 +21,10 @@ module KnownItemSearchClassifier
|
|
19
21
|
end
|
20
22
|
end
|
21
23
|
def train_from_csv filename
|
22
|
-
|
24
|
+
unless defined? @custom_training_set
|
23
25
|
@custom_training_set = GaussianNaiveBayes::Learner.new
|
24
26
|
end
|
25
|
-
csv = CSV.read(filename)
|
27
|
+
csv = ::CSV.read(filename)
|
26
28
|
csv.each do |line|
|
27
29
|
submit_vector line
|
28
30
|
end
|
@@ -2,20 +2,22 @@ module KnownItemSearchClassifier
|
|
2
2
|
class DefaultTrainingSet
|
3
3
|
attr_reader :categories_probabilities, :categories_summaries
|
4
4
|
def initialize
|
5
|
-
@categories_probabilities={:unknown=>0.
|
5
|
+
@categories_probabilities={:unknown=>0.835, :known=>0.165}
|
6
6
|
@categories_summaries= {
|
7
7
|
:unknown=>{
|
8
|
-
0=>{:mean=>0.
|
9
|
-
1=>{:mean=>0.
|
10
|
-
2=>{:mean=>0.
|
11
|
-
3=>{:mean=>0.
|
12
|
-
4=>{:mean=>2.
|
8
|
+
0=>{:mean=>0.32335329341317365, :standard_deviation=>0.4691630728112455},
|
9
|
+
1=>{:mean=>0.01867693185058454, :standard_deviation=>0.0856521002382124},
|
10
|
+
2=>{:mean=>0.0024950099800399197, :standard_deviation=>0.02318575984424029},
|
11
|
+
3=>{:mean=>0.18252067293983462, :standard_deviation=>0.32649287803592736},
|
12
|
+
4=>{:mean=>2.2634730538922154, :standard_deviation=>1.3497147972472143},
|
13
|
+
5=>{:mean=>0.20958083832335328, :standard_deviation=>1.2933208182456999}},
|
13
14
|
:known=>{
|
14
|
-
0=>{:mean=>0.
|
15
|
-
1=>{:mean=>0.
|
16
|
-
2=>{:mean=>0.
|
17
|
-
3=>{:mean=>0.
|
18
|
-
4=>{:mean=>
|
15
|
+
0=>{:mean=>0.3333333333333333, :standard_deviation=>0.478713553878169},
|
16
|
+
1=>{:mean=>0.034283854046699896, :standard_deviation=>0.07844034834013752},
|
17
|
+
2=>{:mean=>0.06397250092902267, :standard_deviation=>0.10673099909054994},
|
18
|
+
3=>{:mean=>0.06715805055726004, :standard_deviation=>0.1488979015655406},
|
19
|
+
4=>{:mean=>4.696969696969697, :standard_deviation=>4.9591131294116515},
|
20
|
+
5=>{:mean=>3.9393939393939394, :standard_deviation=>5.606577576491037}}}
|
19
21
|
end
|
20
22
|
end
|
21
23
|
end
|
@@ -12,13 +12,14 @@ module KnownItemSearchClassifier
|
|
12
12
|
@punctuation_ratio = punctuation_ratio
|
13
13
|
@determiner_ratio = determiner_ratio
|
14
14
|
@proper_noun_ratio = proper_noun_ratio
|
15
|
+
@numeric_count = numeric_count
|
15
16
|
|
16
17
|
#@num_keywords = count_keywords
|
17
18
|
#@refers_to_an_item_that_is_known = check_against_known_titles
|
18
19
|
|
19
20
|
end
|
20
21
|
def feature_array
|
21
|
-
return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words]
|
22
|
+
return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
|
22
23
|
end
|
23
24
|
private
|
24
25
|
def is_mixed_case?
|
@@ -35,6 +36,9 @@ module KnownItemSearchClassifier
|
|
35
36
|
num_det = @tagged.scan(/\/DET/).size.to_f
|
36
37
|
return num_det / @num_words
|
37
38
|
end
|
39
|
+
def numeric_count
|
40
|
+
return @string.scan(/[0-9]/).length
|
41
|
+
end
|
38
42
|
def proper_noun_ratio
|
39
43
|
num_prop_noun = @tagged.scan(/\/NNP/).size.to_f
|
40
44
|
return num_prop_noun / @num_words
|
@@ -55,6 +59,7 @@ module KnownItemSearchClassifier
|
|
55
59
|
end
|
56
60
|
def check_against_known_titles
|
57
61
|
known_titles = [
|
62
|
+
'fountainhead',
|
58
63
|
'salt sugar fat',
|
59
64
|
]
|
60
65
|
if known_titles.include? @query_string.downcase
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: known_item_search_classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jane Sandberg
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-03-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: engtagger
|