known_item_search_classifier 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: a95567708e0b56c79c3a102e1d7c72e493e5660518de3b24c8fc42a691609938
|
4
|
+
data.tar.gz: 70ea59d9d7c0451b3d454506e578c2761c12e7d226edca852431c76bee1a9456
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4fb37b0932e9e0c32f9ec0ef6bdc563bd7e4e4cca5f401186daec4ae8d3be112b96478a9f04cf715620144e5db30e340959db808d5cc99841360dd72d480984d
|
7
|
+
data.tar.gz: 96777f8fa22a9208dc4e22a76a4c74dd57785c32a52de386d0e78678880a0d0faa020e0390fe8d1275c2cc3326cdf6cfa2fae6dcee4bbe299c66c79918a696fd
|
@@ -3,52 +3,54 @@ require 'csv'
|
|
3
3
|
require 'gaussian_naive_bayes'
|
4
4
|
|
5
5
|
module KnownItemSearchClassifier
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
6
|
+
# Classifies search query strings with a Gaussian naive Bayes model.
#
# A fresh instance classifies against the statistics bundled in
# DefaultTrainingSet. Once #train or #train_from_csv has been called, every
# later classification uses only the custom learner built from that data.
class Classifier
  # Builds the default model from the bundled per-category summaries and
  # prior probabilities.
  def initialize
    set = DefaultTrainingSet.new
    @default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries,
                                                               set.categories_probabilities
  end

  # Classifies a query string.
  #
  # Despite the predicate-style name, this returns whatever category the
  # underlying model predicts, not true/false.
  #
  # @param query_string [String] the raw search query
  # @return [Object] the category reported by the active model
  def is_known_item_search?(query_string)
    classify query_string
  end

  # Adds labelled examples to the custom learner, creating it on first use.
  #
  # @param training_set [Enumerable] rows where element 0 is the query string
  #   and element 1 is its category
  # @return [Enumerable] the training_set that was passed in
  def train(training_set)
    ensure_custom_training_set
    training_set.each { |query| submit_vector query }
  end

  # Adds labelled examples read from a CSV file to the custom learner.
  # Column 0 of each row is the query string, column 1 is its category.
  #
  # @param filename [String] path of the CSV file to read
  # @return [Array<Array>] the parsed CSV rows
  def train_from_csv(filename)
    ensure_custom_training_set
    ::CSV.read(filename).each { |line| submit_vector line }
  end

  private

  # NOTE(review): nothing in this class assigns @custom_tr, so this private
  # reader always returns nil. It looks like a leftover; confirm before removing.
  attr_reader :custom_tr

  # Creates the custom learner the first time training data is supplied.
  # The learner is created even when the supplied data turns out to be empty,
  # which switches #classify over to the custom model.
  def ensure_custom_training_set
    @custom_training_set ||= GaussianNaiveBayes::Learner.new
  end

  # Extracts features from the string and asks the active model for a category.
  # The custom learner wins as soon as it exists; otherwise the default model
  # is used. (The statements that used to follow an unconditional `return`
  # here could never run and have been dropped.)
  def classify(string)
    feature_array = FeatureExtractor.new(string).feature_array
    return @custom_training_set.classifier.classify(feature_array) if defined?(@custom_training_set)

    @default_training_set.classify(feature_array)
  end

  # Feeds one labelled example (arr[0] = query, arr[1] = category) to the
  # custom learner.
  def submit_vector(arr)
    features = FeatureExtractor.new(arr[0]).feature_array
    @custom_training_set.train features, arr[1]
  end
end
|
54
56
|
end
|
@@ -1,23 +1,24 @@
|
|
1
1
|
module KnownItemSearchClassifier
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
2
|
+
# Precomputed statistics used to build the default Gaussian naive Bayes model:
# a prior probability for each category plus, per category, the mean and
# standard deviation of six numbered features (0..5).
# NOTE(review): the feature numbers presumably follow the order of
# FeatureExtractor#feature_array -- confirm before changing either side.
class DefaultTrainingSet
  attr_reader :categories_probabilities, :categories_summaries

  # Populates both readers with freshly built hashes.
  def initialize
    @categories_probabilities = {
      'known' => 0.3333333333333333,
      'unknown' => 0.6666666666666666
    }
    @categories_summaries = {
      'known' => summarize(
        [
          [0.6, 0.5],
          [0.0516060606060606, 0.09910312916958242],
          [0.06633333333333333, 0.13412266359153804],
          [0.2575454545454545, 0.27976953051588926],
          [4.76, 3.8867295592395754],
          [3.48, 4.91697739131132]
        ]
      ),
      'unknown' => summarize(
        [
          [0.18, 0.38808793449160356],
          [0.03966666666666667, 0.1241245990920947],
          [0.009000000000000001, 0.04482391854210637],
          [0.11, 0.25134558515041244],
          [2.44, 1.0720950308167836],
          [0.14, 0.7001457574195914]
        ]
      )
    }
  end

  private

  # Converts an ordered list of [mean, standard deviation] pairs into a hash
  # keyed by feature number (the pair's position in the list).
  def summarize(pairs)
    pairs.each_with_index.to_h do |(mean, deviation), feature_number|
      [feature_number, { mean: mean, standard_deviation: deviation }]
    end
  end
end
|
23
24
|
end
|
@@ -1,73 +1,75 @@
|
|
1
1
|
require 'engtagger'
|
2
2
|
|
3
3
|
module KnownItemSearchClassifier
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
end
|
21
|
-
def feature_array
|
22
|
-
return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
|
23
|
-
end
|
24
|
-
private
|
25
|
-
def is_mixed_case?
|
26
|
-
if @string =~ /[A-Z]/ and @string =~ /[a-z]/
|
27
|
-
return 1.0
|
28
|
-
end
|
29
|
-
return 0.0
|
30
|
-
end
|
31
|
-
def punctuation_ratio
|
32
|
-
num_punct = @tagged.scan(/\/PP/).size.to_f
|
33
|
-
return num_punct / @num_words
|
34
|
-
end
|
35
|
-
def determiner_ratio
|
36
|
-
num_det = @tagged.scan(/\/DET/).size.to_f
|
37
|
-
return num_det / @num_words
|
38
|
-
end
|
39
|
-
def numeric_count
|
40
|
-
return @string.scan(/[0-9]/).length
|
41
|
-
end
|
42
|
-
def proper_noun_ratio
|
43
|
-
num_prop_noun = @tagged.scan(/\/NNP/).size.to_f
|
44
|
-
return num_prop_noun / @num_words
|
45
|
-
end
|
46
|
-
def count_keywords
|
47
|
-
end
|
48
|
-
def check_against_known_titles
|
49
|
-
end
|
50
|
-
def count_keywords
|
51
|
-
keywords_to_match = ['journal', 'course', 'textbook']
|
52
|
-
num_keywords = 0
|
53
|
-
@query_string.split.each do |word|
|
54
|
-
if keywords_to_match.include? word.gsub(/[[:punct:]]/, '').downcase
|
55
|
-
num_keywords = num_keywords + 1
|
56
|
-
end
|
57
|
-
end
|
58
|
-
return num_keywords
|
59
|
-
end
|
60
|
-
def check_against_known_titles
|
61
|
-
known_titles = [
|
62
|
-
'fountainhead',
|
63
|
-
'salt sugar fat',
|
64
|
-
]
|
65
|
-
if known_titles.include? @query_string.downcase
|
66
|
-
return true
|
67
|
-
else
|
68
|
-
return false
|
69
|
-
end
|
70
|
-
end
|
4
|
+
# Turns a query string into the numeric feature vector consumed by the
# classifier. All features are computed once, in the constructor.
class FeatureExtractor
  # @param string [String] the raw search query
  def initialize(string)
    @string = string
    # NOTE(review): to_s is a guard in case the tagger returns nil (presumably
    # for blank input -- confirm against EngTagger); a String passes through
    # unchanged.
    @tagged = EngTagger.new.get_readable(string).to_s
    # Counts every "/XX" tag marker (slash followed by two capitals) in the
    # tagger output, so every tagged token contributes, whatever its tag.
    @num_words = @tagged.scan(%r{/[A-Z]{2}}).size.to_f

    @mixed_case = is_mixed_case?
    @punctuation_ratio = punctuation_ratio
    @determiner_ratio = determiner_ratio
    @proper_noun_ratio = proper_noun_ratio
    @numeric_count = numeric_count

    # @num_keywords = count_keywords
    # @refers_to_an_item_that_is_known = check_against_known_titles
  end

  # @return [Array<Numeric>] in order: mixed-case flag (1.0/0.0), punctuation
  #   ratio, determiner ratio, proper-noun ratio, tagged-token count (Float)
  #   and digit count (Integer)
  def feature_array
    [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
  end

  private

  # 1.0 when the query contains both an ASCII upper-case and an ASCII
  # lower-case letter, otherwise 0.0.
  def is_mixed_case?
    return 1.0 if @string =~ /[A-Z]/ && @string =~ /[a-z]/

    0.0
  end

  # Share of tagged tokens whose tag starts with "PP".
  def punctuation_ratio
    tag_ratio(%r{/PP})
  end

  # Share of tagged tokens whose tag starts with "DET".
  def determiner_ratio
    tag_ratio(%r{/DET})
  end

  # Number of ASCII digits in the raw query.
  def numeric_count
    @string.scan(/[0-9]/).length
  end

  # Share of tagged tokens whose tag starts with "NNP".
  def proper_noun_ratio
    tag_ratio(%r{/NNP})
  end

  # Occurrences of tag_pattern in the tagger output divided by the token
  # count. Returns 0.0 when nothing was tagged, so the feature vector never
  # contains NaN from a 0/0 division.
  def tag_ratio(tag_pattern)
    return 0.0 if @num_words.zero?

    @tagged.scan(tag_pattern).size / @num_words
  end

  # Number of words in the query that, once punctuation is stripped and the
  # word is lower-cased, match one of a few hard-coded keywords.
  # Currently unused (its call in #initialize is commented out).
  def count_keywords
    keywords_to_match = %w[journal course textbook]
    @string.split.count do |word|
      keywords_to_match.include?(word.gsub(/[[:punct:]]/, '').downcase)
    end
  end

  # Whether the lower-cased query exactly equals one of a few hard-coded
  # titles. Currently unused (its call in #initialize is commented out).
  def check_against_known_titles
    known_titles = [
      'fountainhead',
      'salt sugar fat'
    ]
    known_titles.include?(@string.downcase)
  end
end
|
75
|
+
end
|
@@ -1,62 +1,49 @@
|
|
1
|
-
require 'coveralls'
|
2
|
-
Coveralls.wear!
|
3
1
|
require 'minitest/autorun'
|
4
2
|
require './lib/known_item_search_classifier'
|
5
3
|
|
6
|
-
|
7
4
|
class KnownItemSearchClassifierTest < Minitest::Test
|
8
|
-
|
5
|
+
# One classifier instance is shared by every generated test below.
classifier = KnownItemSearchClassifier::Classifier.new

# Queries expected to be classified as known-item searches. Entries that are
# commented out are known-item queries the classifier currently gets wrong
# (it classifies them as unknown).
known_item_training_set = [
  # 'hobbit first edition', -- classifier incorrectly classifies this as unknown
  # 'my soul is rested', -- classifier incorrectly classifies this as unknown
  # 'new yorker', -- classifier incorrectly classifies this as unknown
  # 'when harry met sally', -- classifier incorrectly classifies this as unknown
  # '"neo tekunoroji"', -- classifier incorrectly classifies this as unknown
  '99131236427206421',
  'A decision making model for selecting start-up businesses in a government venture capital scheme',
  # 'Dostoevsky Brothers Karamazov', -- classifier incorrectly classifies this as unknown
  # 'Lawrence Classic American Literature', -- classifier incorrectly classifies this as unknown
  # 'salt sugar fat', -- classifier incorrectly classifies this as unknown
  'Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction. Viking. 2015. Print',
  'the inconvenient truth',
  'Polarization: What Everyone Needs to Know',
  'little house on the'
]

# Generate one test method per query; the method name embeds the query with
# punctuation removed and whitespace replaced by underscores.
known_item_training_set.each do |query|
  slug = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
  test_name = "test_#{slug}_is_classified_as_known_item"
  define_method(test_name) do
    predicted = classifier.is_known_item_search?(query).to_sym
    assert_equal(:known, predicted)
  end
end
|
30
29
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
"Professional baking ",
|
49
|
-
"concussions after the nfl",
|
50
|
-
"IVF the US",
|
51
|
-
"adoption children the US",
|
52
|
-
"Films for the hearing impaired",
|
53
|
-
"wolves and the ecosystem",
|
54
|
-
"dr. martin luther king",
|
55
|
-
]
|
56
|
-
unknown_item_training_set.each do |query|
|
57
|
-
cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
|
58
|
-
define_method("test_#{cleaned_up_query}_is_not_false_positive") do
|
59
|
-
assert_equal(:unknown, classifier.is_known_item_search?(query))
|
60
|
-
end
|
30
|
+
# Queries expected to be classified as unknown-item searches. The entry that
# is commented out is one the classifier currently gets wrong (it classifies
# it as known).
unknown_item_training_set = [
  'colonial mexico textiles',
  'history of horses',
  'medical expertise COVID',
  'music and sexuality',
  'paper industry',
  'sun ra',
  # 'concussions after the nfl', -- classifier incorrectly classifies this as known
  'Professional baking ',
  'Manos chatzidakis',
  'whey protein',
  'benefits of eating healthyhy'
]

# Generate one test method per query; the method name embeds the query with
# punctuation removed and whitespace replaced by underscores.
unknown_item_training_set.each do |query|
  slug = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
  test_name = "test_#{slug}_is_not_false_positive"
  define_method(test_name) do
    predicted = classifier.is_known_item_search?(query).to_sym
    assert_equal(:unknown, predicted)
  end
end
|
62
49
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: known_item_search_classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jane Sandberg
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: engtagger
|
@@ -53,21 +53,35 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rake
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- -
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rubocop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version: 0
|
75
|
+
version: '0'
|
62
76
|
type: :development
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
|
-
- -
|
80
|
+
- - ">="
|
67
81
|
- !ruby/object:Gem::Version
|
68
|
-
version: 0
|
82
|
+
version: '0'
|
69
83
|
description: Classify search query strings
|
70
|
-
email:
|
84
|
+
email:
|
71
85
|
executables: []
|
72
86
|
extensions: []
|
73
87
|
extra_rdoc_files: []
|
@@ -81,7 +95,7 @@ homepage: https://github.com/sandbergja/known_item_search_classifier
|
|
81
95
|
licenses:
|
82
96
|
- MIT
|
83
97
|
metadata: {}
|
84
|
-
post_install_message:
|
98
|
+
post_install_message:
|
85
99
|
rdoc_options: []
|
86
100
|
require_paths:
|
87
101
|
- lib
|
@@ -89,16 +103,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
89
103
|
requirements:
|
90
104
|
- - ">="
|
91
105
|
- !ruby/object:Gem::Version
|
92
|
-
version:
|
106
|
+
version: 3.0.0
|
93
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
108
|
requirements:
|
95
109
|
- - ">="
|
96
110
|
- !ruby/object:Gem::Version
|
97
111
|
version: '0'
|
98
112
|
requirements: []
|
99
|
-
|
100
|
-
|
101
|
-
signing_key:
|
113
|
+
rubygems_version: 3.5.16
|
114
|
+
signing_key:
|
102
115
|
specification_version: 4
|
103
116
|
summary: A ruby gem that classifies search query strings as either known-item searches
|
104
117
|
or unknown-item searches
|