known_item_search_classifier 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: a95567708e0b56c79c3a102e1d7c72e493e5660518de3b24c8fc42a691609938
|
4
|
+
data.tar.gz: 70ea59d9d7c0451b3d454506e578c2761c12e7d226edca852431c76bee1a9456
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4fb37b0932e9e0c32f9ec0ef6bdc563bd7e4e4cca5f401186daec4ae8d3be112b96478a9f04cf715620144e5db30e340959db808d5cc99841360dd72d480984d
|
7
|
+
data.tar.gz: 96777f8fa22a9208dc4e22a76a4c74dd57785c32a52de386d0e78678880a0d0faa020e0390fe8d1275c2cc3326cdf6cfa2fae6dcee4bbe299c66c79918a696fd
|
@@ -3,53 +3,54 @@ require 'csv'
|
|
3
3
|
require 'gaussian_naive_bayes'
|
4
4
|
|
5
5
|
module KnownItemSearchClassifier
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
end
|
6
|
+
class Classifier
|
7
|
+
def initialize
|
8
|
+
set = DefaultTrainingSet.new
|
9
|
+
@default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries,
|
10
|
+
set.categories_probabilities
|
11
|
+
end
|
12
|
+
|
13
|
+
def is_known_item_search?(query_string)
|
14
|
+
classify query_string
|
15
|
+
end
|
16
|
+
|
17
|
+
def train(training_set)
|
18
|
+
@custom_training_set = GaussianNaiveBayes::Learner.new unless defined? @custom_training_set
|
19
|
+
training_set.each do |query|
|
20
|
+
submit_vector query
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def train_from_csv(filename)
|
25
|
+
@custom_training_set = GaussianNaiveBayes::Learner.new unless defined? @custom_training_set
|
26
|
+
csv = ::CSV.read(filename)
|
27
|
+
csv.each do |line|
|
28
|
+
submit_vector line
|
29
|
+
end
|
30
|
+
end
|
32
31
|
|
33
32
|
private
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
33
|
+
|
34
|
+
attr_reader :custom_tr
|
35
|
+
|
36
|
+
def classify(string)
|
37
|
+
f = FeatureExtractor.new string
|
38
|
+
feature_array = f.feature_array
|
39
|
+
if defined? @custom_training_set
|
40
|
+
classifier = @custom_training_set.classifier
|
41
|
+
query_class = classifier.classify(feature_array)
|
42
|
+
else
|
43
|
+
query_class = @default_training_set.classify(feature_array)
|
44
|
+
end
|
45
|
+
return query_class
|
46
|
+
return true if :known == query_class
|
47
|
+
|
48
|
+
false
|
49
|
+
end
|
50
|
+
|
51
|
+
def submit_vector(arr)
|
52
|
+
f = FeatureExtractor.new arr[0]
|
53
|
+
@custom_training_set.train f.feature_array, arr[1]
|
54
54
|
end
|
55
|
+
end
|
55
56
|
end
|
@@ -1,23 +1,24 @@
|
|
1
1
|
module KnownItemSearchClassifier
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
2
|
+
class DefaultTrainingSet
|
3
|
+
attr_reader :categories_probabilities, :categories_summaries
|
4
|
+
|
5
|
+
def initialize
|
6
|
+
@categories_probabilities = { 'known' => 0.3333333333333333, 'unknown' => 0.6666666666666666 }
|
7
|
+
@categories_summaries =
|
8
|
+
{ 'known' =>
|
9
|
+
{ 0 => { mean: 0.6, standard_deviation: 0.5 },
|
10
|
+
1 => { mean: 0.0516060606060606, standard_deviation: 0.09910312916958242 },
|
11
|
+
2 => { mean: 0.06633333333333333, standard_deviation: 0.13412266359153804 },
|
12
|
+
3 => { mean: 0.2575454545454545, standard_deviation: 0.27976953051588926 },
|
13
|
+
4 => { mean: 4.76, standard_deviation: 3.8867295592395754 },
|
14
|
+
5 => { mean: 3.48, standard_deviation: 4.91697739131132 } },
|
15
|
+
'unknown' =>
|
16
|
+
{ 0 => { mean: 0.18, standard_deviation: 0.38808793449160356 },
|
17
|
+
1 => { mean: 0.03966666666666667, standard_deviation: 0.1241245990920947 },
|
18
|
+
2 => { mean: 0.009000000000000001, standard_deviation: 0.04482391854210637 },
|
19
|
+
3 => { mean: 0.11, standard_deviation: 0.25134558515041244 },
|
20
|
+
4 => { mean: 2.44, standard_deviation: 1.0720950308167836 },
|
21
|
+
5 => { mean: 0.14, standard_deviation: 0.7001457574195914 } } }
|
22
22
|
end
|
23
|
+
end
|
23
24
|
end
|
@@ -1,73 +1,75 @@
|
|
1
1
|
require 'engtagger'
|
2
2
|
|
3
3
|
module KnownItemSearchClassifier
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
end
|
21
|
-
def feature_array
|
22
|
-
return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
|
23
|
-
end
|
24
|
-
private
|
25
|
-
def is_mixed_case?
|
26
|
-
if @string =~ /[A-Z]/ and @string =~ /[a-z]/
|
27
|
-
return 1.0
|
28
|
-
end
|
29
|
-
return 0.0
|
30
|
-
end
|
31
|
-
def punctuation_ratio
|
32
|
-
num_punct = @tagged.scan(/\/PP/).size.to_f
|
33
|
-
return num_punct / @num_words
|
34
|
-
end
|
35
|
-
def determiner_ratio
|
36
|
-
num_det = @tagged.scan(/\/DET/).size.to_f
|
37
|
-
return num_det / @num_words
|
38
|
-
end
|
39
|
-
def numeric_count
|
40
|
-
return @string.scan(/[0-9]/).length
|
41
|
-
end
|
42
|
-
def proper_noun_ratio
|
43
|
-
num_prop_noun = @tagged.scan(/\/NNP/).size.to_f
|
44
|
-
return num_prop_noun / @num_words
|
45
|
-
end
|
46
|
-
def count_keywords
|
47
|
-
end
|
48
|
-
def check_against_known_titles
|
49
|
-
end
|
50
|
-
def count_keywords
|
51
|
-
keywords_to_match = ['journal', 'course', 'textbook']
|
52
|
-
num_keywords = 0
|
53
|
-
@query_string.split.each do |word|
|
54
|
-
if keywords_to_match.include? word.gsub(/[[:punct:]]/, '').downcase
|
55
|
-
num_keywords = num_keywords + 1
|
56
|
-
end
|
57
|
-
end
|
58
|
-
return num_keywords
|
59
|
-
end
|
60
|
-
def check_against_known_titles
|
61
|
-
known_titles = [
|
62
|
-
'fountainhead',
|
63
|
-
'salt sugar fat',
|
64
|
-
]
|
65
|
-
if known_titles.include? @query_string.downcase
|
66
|
-
return true
|
67
|
-
else
|
68
|
-
return false
|
69
|
-
end
|
70
|
-
end
|
4
|
+
class FeatureExtractor
|
5
|
+
def initialize(string)
|
6
|
+
@string = string
|
7
|
+
tagger = EngTagger.new
|
8
|
+
@tagged = tagger.get_readable string
|
9
|
+
@num_words = @tagged.scan(%r{/[A-Z]{2}}).size.to_f
|
10
|
+
|
11
|
+
@mixed_case = is_mixed_case?
|
12
|
+
@punctuation_ratio = punctuation_ratio
|
13
|
+
@determiner_ratio = determiner_ratio
|
14
|
+
@proper_noun_ratio = proper_noun_ratio
|
15
|
+
@numeric_count = numeric_count
|
16
|
+
|
17
|
+
# @num_keywords = count_keywords
|
18
|
+
# @refers_to_an_item_that_is_known = check_against_known_titles
|
71
19
|
end
|
72
|
-
end
|
73
20
|
|
21
|
+
def feature_array
|
22
|
+
[@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def is_mixed_case?
|
28
|
+
return 1.0 if @string =~ /[A-Z]/ and @string =~ /[a-z]/
|
29
|
+
|
30
|
+
0.0
|
31
|
+
end
|
32
|
+
|
33
|
+
def punctuation_ratio
|
34
|
+
num_punct = @tagged.scan(%r{/PP}).size.to_f
|
35
|
+
num_punct / @num_words
|
36
|
+
end
|
37
|
+
|
38
|
+
def determiner_ratio
|
39
|
+
num_det = @tagged.scan(%r{/DET}).size.to_f
|
40
|
+
num_det / @num_words
|
41
|
+
end
|
42
|
+
|
43
|
+
def numeric_count
|
44
|
+
@string.scan(/[0-9]/).length
|
45
|
+
end
|
46
|
+
|
47
|
+
def proper_noun_ratio
|
48
|
+
num_prop_noun = @tagged.scan(%r{/NNP}).size.to_f
|
49
|
+
num_prop_noun / @num_words
|
50
|
+
end
|
51
|
+
|
52
|
+
def count_keywords; end
|
53
|
+
|
54
|
+
def check_against_known_titles; end
|
55
|
+
|
56
|
+
def count_keywords
|
57
|
+
keywords_to_match = %w[journal course textbook]
|
58
|
+
num_keywords = 0
|
59
|
+
@query_string.split.each do |word|
|
60
|
+
num_keywords += 1 if keywords_to_match.include? word.gsub(/[[:punct:]]/, '').downcase
|
61
|
+
end
|
62
|
+
num_keywords
|
63
|
+
end
|
64
|
+
|
65
|
+
def check_against_known_titles
|
66
|
+
known_titles = [
|
67
|
+
'fountainhead',
|
68
|
+
'salt sugar fat'
|
69
|
+
]
|
70
|
+
return true if known_titles.include? @query_string.downcase
|
71
|
+
|
72
|
+
false
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -1,62 +1,49 @@
|
|
1
|
-
require 'coveralls'
|
2
|
-
Coveralls.wear!
|
3
1
|
require 'minitest/autorun'
|
4
2
|
require './lib/known_item_search_classifier'
|
5
3
|
|
6
|
-
|
7
4
|
class KnownItemSearchClassifierTest < Minitest::Test
|
8
|
-
|
5
|
+
classifier = KnownItemSearchClassifier::Classifier.new
|
9
6
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
7
|
+
known_item_training_set = [
|
8
|
+
# 'hobbit first edition', -- classifier incorrectly classifies this as unknown
|
9
|
+
# 'my soul is rested', -- classifier incorrectly classifies this as unknown
|
10
|
+
# 'new yorker', -- classifier incorrectly classifies this as unknown
|
11
|
+
# 'when harry met sally', -- classifier incorrectly classifies this as unknown
|
12
|
+
# '"neo tekunoroji"', -- classifier incorrectly classifies this as unknown
|
13
|
+
'99131236427206421',
|
14
|
+
'A decision making model for selecting start-up businesses in a government venture capital scheme',
|
15
|
+
# 'Dostoevsky Brothers Karamazov', -- classifier incorrectly classifies this as unknown
|
16
|
+
# 'Lawrence Classic American Literature', -- classifier incorrectly classifies this as unknown
|
17
|
+
# 'salt sugar fat', -- classifier incorrectly classifies this as unknown
|
18
|
+
'Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction. Viking. 2015. Print',
|
19
|
+
'the inconvenient truth',
|
20
|
+
'Polarization: What Everyone Needs to Know',
|
21
|
+
'little house on the'
|
22
|
+
]
|
23
|
+
known_item_training_set.each do |query|
|
24
|
+
cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
|
25
|
+
define_method("test_#{cleaned_up_query}_is_classified_as_known_item") do
|
26
|
+
assert_equal(:known, classifier.is_known_item_search?(query).to_sym)
|
29
27
|
end
|
28
|
+
end
|
30
29
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
"Professional baking ",
|
49
|
-
"concussions after the nfl",
|
50
|
-
"IVF the US",
|
51
|
-
"adoption children the US",
|
52
|
-
"Films for the hearing impaired",
|
53
|
-
"wolves and the ecosystem",
|
54
|
-
"dr. martin luther king",
|
55
|
-
]
|
56
|
-
unknown_item_training_set.each do |query|
|
57
|
-
cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
|
58
|
-
define_method("test_#{cleaned_up_query}_is_not_false_positive") do
|
59
|
-
assert_equal(:unknown, classifier.is_known_item_search?(query))
|
60
|
-
end
|
30
|
+
unknown_item_training_set = [
|
31
|
+
'colonial mexico textiles',
|
32
|
+
'history of horses',
|
33
|
+
'medical expertise COVID',
|
34
|
+
'music and sexuality',
|
35
|
+
'paper industry',
|
36
|
+
'sun ra',
|
37
|
+
# 'concussions after the nfl', -- classifier incorrectly classifies this as known
|
38
|
+
'Professional baking ',
|
39
|
+
'Manos chatzidakis',
|
40
|
+
'whey protein',
|
41
|
+
'benefits of eating healthyhy'
|
42
|
+
]
|
43
|
+
unknown_item_training_set.each do |query|
|
44
|
+
cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
|
45
|
+
define_method("test_#{cleaned_up_query}_is_not_false_positive") do
|
46
|
+
assert_equal(:unknown, classifier.is_known_item_search?(query).to_sym)
|
61
47
|
end
|
48
|
+
end
|
62
49
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: known_item_search_classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jane Sandberg
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: engtagger
|
@@ -53,21 +53,35 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rake
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- -
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rubocop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version: 0
|
75
|
+
version: '0'
|
62
76
|
type: :development
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
|
-
- -
|
80
|
+
- - ">="
|
67
81
|
- !ruby/object:Gem::Version
|
68
|
-
version: 0
|
82
|
+
version: '0'
|
69
83
|
description: Classify search query strings
|
70
|
-
email:
|
84
|
+
email:
|
71
85
|
executables: []
|
72
86
|
extensions: []
|
73
87
|
extra_rdoc_files: []
|
@@ -81,7 +95,7 @@ homepage: https://github.com/sandbergja/known_item_search_classifier
|
|
81
95
|
licenses:
|
82
96
|
- MIT
|
83
97
|
metadata: {}
|
84
|
-
post_install_message:
|
98
|
+
post_install_message:
|
85
99
|
rdoc_options: []
|
86
100
|
require_paths:
|
87
101
|
- lib
|
@@ -89,16 +103,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
89
103
|
requirements:
|
90
104
|
- - ">="
|
91
105
|
- !ruby/object:Gem::Version
|
92
|
-
version:
|
106
|
+
version: 3.0.0
|
93
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
108
|
requirements:
|
95
109
|
- - ">="
|
96
110
|
- !ruby/object:Gem::Version
|
97
111
|
version: '0'
|
98
112
|
requirements: []
|
99
|
-
|
100
|
-
|
101
|
-
signing_key:
|
113
|
+
rubygems_version: 3.5.16
|
114
|
+
signing_key:
|
102
115
|
specification_version: 4
|
103
116
|
summary: A ruby gem that classifies search query strings as either known-item searches
|
104
117
|
or unknown-item searches
|