known_item_search_classifier 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: df03a8dc0661439a41c7d3366a49573f463a176f
4
+ data.tar.gz: 58713a770bba4a173adc4e88bb507f256b4eafc5
5
+ SHA512:
6
+ metadata.gz: 470565b3b2932df41a0d02b99048746b5f5e97de476b273a9648d473d63d7e73e44d038ff2b35ffd1f0bbd413960d35df5ca1ad8e8b8902038aedb57d64225f8
7
+ data.tar.gz: 7e81f8262925a653ab33c12844947f9df40b5510cf3e43d801d6a31cab7f3bf3a3de58dc9693ecbd71283ff309dc9bdb6490d1a6be25fe436726ffc09fea28e0
@@ -0,0 +1,53 @@
1
+ # Classifies search strings as either known-item searches or unknown-item searches
2
+ require 'gaussian_naive_bayes'
3
+
4
module KnownItemSearchClassifier
  # Classifies search strings with a Gaussian naive Bayes model.
  #
  # A fresh instance classifies with the statistics bundled in
  # DefaultTrainingSet. As soon as at least one training example has been
  # submitted through #train or #train_from_csv, classification switches to
  # the custom learner built from those examples.
  class Classifier
    def initialize
      set = DefaultTrainingSet.new
      @default_training_set = GaussianNaiveBayes::Classifier.new(
        set.categories_summaries, set.categories_probabilities
      )
    end

    # Classifies a query.
    #
    # Despite the predicate-style name, this returns the category produced by
    # the underlying classifier rather than true/false; the bundled test suite
    # compares the result against :known and :unknown.
    #
    # @param query_string [String] the search query to classify
    # @return [Object] the predicted category
    def is_known_item_search?(query_string)
      classify(query_string)
    end

    # Adds training examples to the custom learner.
    #
    # @param training_set [Enumerable] rows whose element 0 is the query
    #   string and whose element 1 is its category
    def train(training_set)
      training_set.each { |query| submit_vector(query) }
    end

    # Adds training examples read from a CSV file. Column 0 is used as the
    # query string and column 1 as its category. CSV cells are read as
    # Strings, so categories learned this way are Strings, not Symbols.
    #
    # @param filename [String] path of the CSV file to read
    def train_from_csv(filename)
      # Loaded lazily so that only callers using CSV training need the library.
      require 'csv'
      CSV.read(filename).each { |line| submit_vector(line) }
    end

    private

    # Returns the custom learner, creating it on first use. The previous
    # guard (`if defined? @custom_training_set`) was inverted, so the learner
    # was never created and training always failed on nil.
    def custom_learner
      @custom_training_set ||= GaussianNaiveBayes::Learner.new
    end

    # Extracts the feature vector for +string+ and classifies it with the
    # custom learner when one exists, otherwise with the bundled statistics.
    def classify(string)
      feature_array = FeatureExtractor.new(string).feature_array
      if defined?(@custom_training_set)
        @custom_training_set.classifier.classify(feature_array)
      else
        @default_training_set.classify(feature_array)
      end
    end

    # Feeds one [query, category] pair to the custom learner.
    def submit_vector(arr)
      features = FeatureExtractor.new(arr[0]).feature_array
      custom_learner.train(features, arr[1])
    end
  end
end
@@ -0,0 +1,21 @@
1
module KnownItemSearchClassifier
  # Bundled statistics used when no custom training data has been supplied.
  #
  # +categories_probabilities+ maps each category to its prior probability.
  # +categories_summaries+ maps each category to a hash keyed by feature index
  # (0..4) whose values hold that feature's :mean and :standard_deviation.
  # NOTE(review): the indices presumably follow the order of
  # FeatureExtractor#feature_array - confirm.
  class DefaultTrainingSet
    attr_reader :categories_probabilities, :categories_summaries

    def initialize
      @categories_probabilities = { unknown: 0.78, known: 0.22 }
      @categories_summaries = {
        unknown: indexed_summaries(
          [0.2564102564102564, 0.4394771815921655],
          [0.03418803418803419, 0.11344969312798027],
          [0.002564102564102564, 0.0226455406828919],
          [0.12991452991452992, 0.26648206508636013],
          [2.7948717948717947, 2.053561836691609]
        ),
        known: indexed_summaries(
          [0.5454545454545454, 0.5096471914376255],
          [0.051659451659451655, 0.07957404805575267],
          [0.021248196248196245, 0.04412470821426937],
          [0.22550505050505054, 0.2520704609787127],
          [7.590909090909091, 5.770690236086651]
        )
      }
    end

    private

    # Builds { index => { mean:, standard_deviation: } } from a list of
    # [mean, standard_deviation] pairs, numbering them from 0 in order.
    def indexed_summaries(*pairs)
      pairs.each_with_index.each_with_object({}) do |((mean, deviation), index), table|
        table[index] = { mean: mean, standard_deviation: deviation }
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ require 'engtagger'
2
+
3
module KnownItemSearchClassifier
  # Converts a search string into the numeric feature vector used for
  # classification. The string is run through EngTagger once, and every ratio
  # is computed from the resulting tagged text.
  class FeatureExtractor
    # Words counted by #count_keywords (compared lower-cased, punctuation removed).
    KEYWORDS = %w[journal course textbook].freeze
    # Titles recognised by #check_against_known_titles (compared lower-cased).
    KNOWN_TITLES = ['salt sugar fat'].freeze

    # @param string [String] the search query to analyse
    def initialize(string)
      @string = string
      # NOTE(review): EngTagger#get_readable appears to return nil for blank
      # input - confirm against the engtagger docs. to_s keeps the scans below
      # safe either way.
      @tagged = EngTagger.new.get_readable(string).to_s
      # One match per tag that starts with two capital letters.
      @num_words = @tagged.scan(/\/[A-Z]{2}/).size.to_f

      @mixed_case = is_mixed_case?
      @punctuation_ratio = punctuation_ratio
      @determiner_ratio = determiner_ratio
      @proper_noun_ratio = proper_noun_ratio

      # Not part of the feature vector yet:
      # @num_keywords = count_keywords
      # @refers_to_an_item_that_is_known = check_against_known_titles
    end

    # @return [Array<Float>] mixed-case flag, punctuation ratio, determiner
    #   ratio, proper-noun ratio and tagged word count, in that order
    def feature_array
      [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words]
    end

    private

    # 1.0 when the string contains both an ASCII upper-case and an ASCII
    # lower-case letter, otherwise 0.0.
    def is_mixed_case?
      @string =~ /[A-Z]/ && @string =~ /[a-z]/ ? 1.0 : 0.0
    end

    # Share of tagged words whose tag matches /PP.
    def punctuation_ratio
      tag_ratio(/\/PP/)
    end

    # Share of tagged words whose tag matches /DET.
    def determiner_ratio
      tag_ratio(/\/DET/)
    end

    # Share of tagged words whose tag matches /NNP.
    def proper_noun_ratio
      tag_ratio(/\/NNP/)
    end

    # Occurrences of +pattern+ in the tagged text divided by the word count.
    # Returns 0.0 when nothing was tagged, so the vector never contains NaN.
    def tag_ratio(pattern)
      return 0.0 if @num_words.zero?

      @tagged.scan(pattern).size / @num_words
    end

    # Number of whitespace-separated words that are one of KEYWORDS once
    # punctuation is stripped and the word is lower-cased.
    def count_keywords
      @string.split.count do |word|
        KEYWORDS.include?(word.gsub(/[[:punct:]]/, '').downcase)
      end
    end

    # true when the lower-cased string exactly equals one of KNOWN_TITLES.
    def check_against_known_titles
      KNOWN_TITLES.include?(@string.downcase)
    end
  end
end
68
+
@@ -0,0 +1,3 @@
1
+ require 'known_item_search_classifier/default_training_set'
2
+ require 'known_item_search_classifier/feature_extractor'
3
+ require 'known_item_search_classifier/classifier'
@@ -0,0 +1,62 @@
1
+ require 'coveralls'
2
+ Coveralls.wear!
3
+ require 'minitest/autorun'
4
+ require './lib/known_item_search_classifier'
5
+
6
+
7
# Generates one test method per sample query, checking the category that the
# default classifier returns for it.
class KnownItemSearchClassifierTest < Minitest::Test
  # A single classifier instance is captured by every generated test method.
  classifier = KnownItemSearchClassifier::Classifier.new

  # Turns a query into a fragment usable in a method name: punctuation is
  # dropped and each whitespace character becomes an underscore.
  method_fragment = lambda do |query|
    query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
  end

  # Queries expected to be classified as :known.
  known_item_queries = [
    "little house on the",
    "the inconvenient truth",
    "the question of animal Culture by Kevin N Laland; Bennett G Galef ",
    "Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction. Viking. 2015. Print",
    "The Boy in Zaquitos",
    "The Mis-Education of the Negro",
    "human relations interpersonal job-oriented skills",
    "Research Methods for Business: A Skill-Building Approach Effectiveness of Instruction Performed through Computer-Assisted Activity Schedules on On-Schedule and Role-Play Skills of Children with Autism Spectrum Disorder",
    "competency skills for the dental assiostant",
    "Why did they kill?: Cambodia in the shadow of genocide",
    "salt sugar fat",
    "Making a Killing: Femicide, Free Trade, and La Frontera",
  ]

  # Queries expected to be classified as :unknown.
  unknown_item_queries = [
    "earthworms",
    "network security",
    "work stress",
    "mummies",
    "benefits of eating healthyhy",
    "benefits of eating healthy",
    "megadosing vitamin c",
    "nutrition",
    "penquin",
    "bananas",
    "food sourcing",
    "whey protein",
    "exotic animals",
    "sweet home oregon",
    "taylor swift",
    "catholicism",
    "Professional baking ",
    "concussions after the nfl",
    "IVF the US",
    "adoption children the US",
    "Films for the hearing impaired",
    "wolves and the ecosystem",
    "dr. martin luther king",
  ]

  # [queries, expected category, method-name suffix]; known-item tests are
  # defined first, then the unknown-item ones.
  [
    [known_item_queries, :known, 'is_classified_as_known_item'],
    [unknown_item_queries, :unknown, 'is_not_false_positive'],
  ].each do |queries, expected_category, suffix|
    queries.each do |query|
      define_method("test_#{method_fragment.call(query)}_#{suffix}") do
        assert_equal(expected_category, classifier.is_known_item_search?(query))
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: known_item_search_classifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jane Sandberg
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-11-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: engtagger
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.2.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.2.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: gaussian_naive_bayes
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.1.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: coveralls
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '='
60
+ - !ruby/object:Gem::Version
61
+ version: 0.7.0
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '='
67
+ - !ruby/object:Gem::Version
68
+ version: 0.7.0
69
+ description: Classify search query strings
70
+ email: sandbej@linnbenton.edu
71
+ executables: []
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - lib/known_item_search_classifier.rb
76
+ - lib/known_item_search_classifier/classifier.rb
77
+ - lib/known_item_search_classifier/default_training_set.rb
78
+ - lib/known_item_search_classifier/feature_extractor.rb
79
+ - test/known_item_search_classifier_test.rb
80
+ homepage: https://github.com/sandbergja/known_item_search_classifier
81
+ licenses:
82
+ - MIT
83
+ metadata: {}
84
+ post_install_message:
85
+ rdoc_options: []
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ requirements: []
99
+ rubyforge_project:
100
+ rubygems_version: 2.5.1
101
+ signing_key:
102
+ specification_version: 4
103
+ summary: A ruby gem that classifies search query strings as either known-item searches
104
+ or unknown-item searches
105
+ test_files:
106
+ - test/known_item_search_classifier_test.rb