known_item_search_classifier 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/known_item_search_classifier/classifier.rb +53 -0
- data/lib/known_item_search_classifier/default_training_set.rb +21 -0
- data/lib/known_item_search_classifier/feature_extractor.rb +68 -0
- data/lib/known_item_search_classifier.rb +3 -0
- data/test/known_item_search_classifier_test.rb +62 -0
- metadata +106 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: df03a8dc0661439a41c7d3366a49573f463a176f
|
4
|
+
data.tar.gz: 58713a770bba4a173adc4e88bb507f256b4eafc5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 470565b3b2932df41a0d02b99048746b5f5e97de476b273a9648d473d63d7e73e44d038ff2b35ffd1f0bbd413960d35df5ca1ad8e8b8902038aedb57d64225f8
|
7
|
+
data.tar.gz: 7e81f8262925a653ab33c12844947f9df40b5510cf3e43d801d6a31cab7f3bf3a3de58dc9693ecbd71283ff309dc9bdb6490d1a6be25fe436726ffc09fea28e0
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# Classifies search strings as either known-item searches or unknown-item searches
|
2
|
+
require 'gaussian_naive_bayes'
|
3
|
+
|
4
|
+
module KnownItemSearchClassifier
  # Classifies search query strings with a Gaussian Naive Bayes model.
  #
  # A new instance classifies with the parameters bundled in
  # DefaultTrainingSet. As soon as at least one sample has been submitted
  # through #train or #train_from_csv, every later classification uses the
  # classifier produced by the custom learner instead of the bundled model.
  class Classifier
    # Builds the default model from the bundled summaries and probabilities.
    def initialize
      set = DefaultTrainingSet.new
      @default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries, set.categories_probabilities
      # Created lazily by #learner the first time a training sample arrives.
      @custom_training_set = nil
    end

    # Classifies a query string.
    #
    # @param query_string [String] the search query to classify
    # @return [Object] the category chosen by the active model. With the
    #   bundled model this is the Symbol :known or :unknown (not a boolean,
    #   despite the trailing "?"); with a custom model it is whatever
    #   category values were supplied during training.
    def is_known_item_search?(query_string)
      classify query_string
    end

    # Adds samples to the custom model.
    #
    # @param training_set [Enumerable] pairs of [query_string, category];
    #   only the first two elements of each entry are read
    # @return [Enumerable] the training_set that was passed in
    def train(training_set)
      training_set.each { |query| submit_vector query }
    end

    # Adds every row of a CSV file to the custom model.
    #
    # Column 0 of each row is used as the query string and column 1 as its
    # category.
    # NOTE(review): CSV.read yields Strings, so categories learned this way
    # are Strings ("known") rather than the Symbols (:known) used by the
    # bundled model - confirm that callers expect this.
    #
    # @param filename [String] path of the CSV file to read
    def train_from_csv(filename)
      # Loaded lazily: CSV is only needed by this method and nothing else in
      # this file requires it.
      require 'csv'
      CSV.read(filename).each { |line| submit_vector line }
    end

    private

    # Extracts the feature vector for +string+ and classifies it with the
    # custom model when one has been trained, otherwise with the bundled one.
    def classify(string)
      features = FeatureExtractor.new(string).feature_array
      model = @custom_training_set ? @custom_training_set.classifier : @default_training_set
      model.classify(features)
    end

    # Feeds one [query_string, category] pair to the custom learner.
    def submit_vector(arr)
      learner.train FeatureExtractor.new(arr[0]).feature_array, arr[1]
    end

    # Returns the custom learner, creating it on first use. Creating it only
    # when a sample is actually submitted means an empty training set leaves
    # the bundled model in effect.
    def learner
      @custom_training_set ||= GaussianNaiveBayes::Learner.new
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module KnownItemSearchClassifier
  # Pre-computed model parameters that Classifier hands to
  # GaussianNaiveBayes::Classifier when no custom training has been done.
  #
  # categories_probabilities maps each category (:unknown, :known) to its
  # probability. categories_summaries maps each category to a hash keyed by
  # feature index (0..4), each holding that feature's :mean and
  # :standard_deviation. The indexes follow the order of
  # FeatureExtractor#feature_array: mixed case, punctuation ratio,
  # determiner ratio, proper noun ratio, number of words.
  class DefaultTrainingSet
    attr_reader :categories_probabilities, :categories_summaries

    def initialize
      @categories_probabilities = { :unknown => 0.78, :known => 0.22 }
      @categories_summaries = {
        :unknown => summaries_for(
          [0.2564102564102564, 0.4394771815921655],
          [0.03418803418803419, 0.11344969312798027],
          [0.002564102564102564, 0.0226455406828919],
          [0.12991452991452992, 0.26648206508636013],
          [2.7948717948717947, 2.053561836691609]
        ),
        :known => summaries_for(
          [0.5454545454545454, 0.5096471914376255],
          [0.051659451659451655, 0.07957404805575267],
          [0.021248196248196245, 0.04412470821426937],
          [0.22550505050505054, 0.2520704609787127],
          [7.590909090909091, 5.770690236086651]
        )
      }
    end

    private

    # Turns an ordered list of [mean, standard_deviation] pairs into a hash
    # keyed by the pair's position (the feature index).
    def summaries_for(*pairs)
      indexed = {}
      pairs.each_with_index do |(mean, deviation), feature_index|
        indexed[feature_index] = { :mean => mean, :standard_deviation => deviation }
      end
      indexed
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'engtagger'
|
2
|
+
|
3
|
+
module KnownItemSearchClassifier
  # Converts a query string into the numeric feature vector consumed by the
  # Gaussian Naive Bayes classifier.
  #
  # The string is tagged with EngTagger#get_readable and the features are
  # derived by counting tag markers ("/PP", "/DET", "/NNP", ...) in the
  # tagged text.
  class FeatureExtractor
    # Words counted by #count_keywords (compared lower-cased, punctuation
    # stripped).
    KEYWORDS = ['journal', 'course', 'textbook'].freeze

    # Titles recognised by #check_against_known_titles (compared lower-cased).
    KNOWN_TITLES = ['salt sugar fat'].freeze

    # @param string [String] the raw search query
    def initialize(string)
      @string = string.to_s
      # to_s guards against a nil result from the tagger, which would make
      # the scans below raise.
      # NOTE(review): EngTagger appears to return nil for blank text - confirm.
      @tagged = EngTagger.new.get_readable(@string).to_s
      # Every "/XX" marker (slash followed by two capitals) counts as one
      # tagged token; this is the denominator of all ratios.
      @num_words = @tagged.scan(/\/[A-Z]{2}/).size.to_f

      @mixed_case = is_mixed_case?
      @punctuation_ratio = punctuation_ratio
      @determiner_ratio = determiner_ratio
      @proper_noun_ratio = proper_noun_ratio

      # Not part of the feature vector yet:
      #@num_keywords = count_keywords
      #@refers_to_an_item_that_is_known = check_against_known_titles
    end

    # @return [Array<Float>] [mixed case flag, punctuation ratio,
    #   determiner ratio, proper noun ratio, number of tagged tokens]
    def feature_array
      [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words]
    end

    private

    # 1.0 when the query contains both an upper-case and a lower-case ASCII
    # letter, 0.0 otherwise.
    def is_mixed_case?
      @string =~ /[A-Z]/ && @string =~ /[a-z]/ ? 1.0 : 0.0
    end

    # Share of tagged tokens carrying a "/PP" tag.
    def punctuation_ratio
      tag_ratio(/\/PP/)
    end

    # Share of tagged tokens carrying a "/DET" tag.
    def determiner_ratio
      tag_ratio(/\/DET/)
    end

    # Share of tagged tokens carrying a "/NNP" tag.
    def proper_noun_ratio
      tag_ratio(/\/NNP/)
    end

    # Occurrences of +tag_pattern+ in the tagged text divided by the number
    # of tagged tokens. Returns 0.0 when nothing was tagged, so the feature
    # vector never contains NaN from 0.0 / 0.0.
    def tag_ratio(tag_pattern)
      return 0.0 if @num_words.zero?

      @tagged.scan(tag_pattern).size / @num_words
    end

    # Number of whitespace-separated words in the query that match KEYWORDS
    # after removing punctuation and lower-casing.
    def count_keywords
      @string.split.count do |word|
        KEYWORDS.include? word.gsub(/[[:punct:]]/, '').downcase
      end
    end

    # true when the whole lower-cased query is one of KNOWN_TITLES.
    def check_against_known_titles
      KNOWN_TITLES.include? @string.downcase
    end
  end
end
|
68
|
+
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'coveralls'
|
2
|
+
Coveralls.wear!
|
3
|
+
require 'minitest/autorun'
|
4
|
+
require './lib/known_item_search_classifier'
|
5
|
+
|
6
|
+
|
7
|
+
# Data-driven checks of the bundled model: one test method is generated per
# sample query, asserting the category returned by
# Classifier#is_known_item_search?.
class KnownItemSearchClassifierTest < Minitest::Test
  classifier = KnownItemSearchClassifier::Classifier.new

  # Each entry: [expected category, test-name suffix, sample queries].
  expectations = [
    [
      :known,
      'is_classified_as_known_item',
      [
        "little house on the",
        "the inconvenient truth",
        "the question of animal Culture by Kevin N Laland; Bennett G Galef ",
        "Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction. Viking. 2015. Print",
        "The Boy in Zaquitos",
        "The Mis-Education of the Negro",
        "human relations interpersonal job-oriented skills",
        "Research Methods for Business: A Skill-Building Approach Effectiveness of Instruction Performed through Computer-Assisted Activity Schedules on On-Schedule and Role-Play Skills of Children with Autism Spectrum Disorder",
        "competency skills for the dental assiostant",
        "Why did they kill?: Cambodia in the shadow of genocide",
        "salt sugar fat",
        "Making a Killing: Femicide, Free Trade, and La Frontera",
      ]
    ],
    [
      :unknown,
      'is_not_false_positive',
      [
        "earthworms",
        "network security",
        "work stress",
        "mummies",
        "benefits of eating healthyhy",
        "benefits of eating healthy",
        "megadosing vitamin c",
        "nutrition",
        "penquin",
        "bananas",
        "food sourcing",
        "whey protein",
        "exotic animals",
        "sweet home oregon",
        "taylor swift",
        "catholicism",
        "Professional baking ",
        "concussions after the nfl",
        "IVF the US",
        "adoption children the US",
        "Films for the hearing impaired",
        "wolves and the ecosystem",
        "dr. martin luther king",
      ]
    ]
  ]

  expectations.each do |expected_category, name_suffix, queries|
    queries.each do |query|
      # Method names cannot carry punctuation or whitespace, so strip the
      # former and turn the latter into underscores.
      slug = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
      define_method("test_#{slug}_#{name_suffix}") do
        assert_equal(expected_category, classifier.is_known_item_search?(query))
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: known_item_search_classifier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jane Sandberg
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-11-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: engtagger
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.2.1
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.2.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: gaussian_naive_bayes
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.1.1
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.1.1
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: minitest
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: coveralls
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.7.0
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.7.0
|
69
|
+
description: Classify search query strings
|
70
|
+
email: sandbej@linnbenton.edu
|
71
|
+
executables: []
|
72
|
+
extensions: []
|
73
|
+
extra_rdoc_files: []
|
74
|
+
files:
|
75
|
+
- lib/known_item_search_classifier.rb
|
76
|
+
- lib/known_item_search_classifier/classifier.rb
|
77
|
+
- lib/known_item_search_classifier/default_training_set.rb
|
78
|
+
- lib/known_item_search_classifier/feature_extractor.rb
|
79
|
+
- test/known_item_search_classifier_test.rb
|
80
|
+
homepage: https://github.com/sandbergja/known_item_search_classifier
|
81
|
+
licenses:
|
82
|
+
- MIT
|
83
|
+
metadata: {}
|
84
|
+
post_install_message:
|
85
|
+
rdoc_options: []
|
86
|
+
require_paths:
|
87
|
+
- lib
|
88
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
requirements: []
|
99
|
+
rubyforge_project:
|
100
|
+
rubygems_version: 2.5.1
|
101
|
+
signing_key:
|
102
|
+
specification_version: 4
|
103
|
+
summary: A ruby gem that classifies search query strings as either known-item searches
|
104
|
+
or unknown-item searches
|
105
|
+
test_files:
|
106
|
+
- test/known_item_search_classifier_test.rb
|