known_item_search_classifier 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: df03a8dc0661439a41c7d3366a49573f463a176f
4
+ data.tar.gz: 58713a770bba4a173adc4e88bb507f256b4eafc5
5
+ SHA512:
6
+ metadata.gz: 470565b3b2932df41a0d02b99048746b5f5e97de476b273a9648d473d63d7e73e44d038ff2b35ffd1f0bbd413960d35df5ca1ad8e8b8902038aedb57d64225f8
7
+ data.tar.gz: 7e81f8262925a653ab33c12844947f9df40b5510cf3e43d801d6a31cab7f3bf3a3de58dc9693ecbd71283ff309dc9bdb6490d1a6be25fe436726ffc09fea28e0
@@ -0,0 +1,53 @@
1
+ # Classifies search strings as either known-item searches or unknown-item searches
2
+ require 'gaussian_naive_bayes'
3
+
4
require 'csv' # train_from_csv relies on CSV.read

module KnownItemSearchClassifier
  # Classifies search query strings as known-item or unknown-item searches.
  #
  # A fresh instance answers from the model described by DefaultTrainingSet.
  # Once training data has been supplied through #train or #train_from_csv,
  # answers come from the custom model learned from that data instead.
  class Classifier
    # Builds the default classifier from the summaries and probabilities
    # exposed by DefaultTrainingSet. No custom learner exists yet.
    def initialize
      set = DefaultTrainingSet.new
      @default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries, set.categories_probabilities
      @custom_training_set = nil
    end

    # Classifies a single query string.
    #
    # NOTE: despite the predicate-style name, this returns the category
    # chosen by the underlying classifier rather than true/false; the
    # gem's test suite asserts on :known / :unknown for the default model,
    # so that contract is kept as-is.
    #
    # @param query_string [String] the raw search query
    # @return [Object] the category picked by the active model
    def is_known_item_search?(query_string)
      classify query_string
    end

    # Adds training examples to the custom model.
    #
    # @param training_set [Enumerable] rows whose element 0 is the query
    #   string and whose element 1 is its category label
    def train(training_set)
      training_set.each { |query| submit_vector query }
    end

    # Adds training examples read from a CSV file; each row is treated
    # exactly like a row passed to #train (column 0 = query, column 1 = label).
    #
    # @param filename [String] path of the CSV file to read
    def train_from_csv(filename)
      CSV.read(filename).each { |line| submit_vector line }
    end

    private

    # The learner is created on first use. The previous code only created
    # it when it was *already* defined, so the first training call always
    # failed with NoMethodError on nil.
    def custom_training_set
      @custom_training_set ||= GaussianNaiveBayes::Learner.new
    end

    # Extracts the feature vector for +string+ and asks the custom model,
    # if one has been trained, or the default model otherwise.
    def classify(string)
      feature_array = FeatureExtractor.new(string).feature_array
      if @custom_training_set
        @custom_training_set.classifier.classify(feature_array)
      else
        @default_training_set.classify(feature_array)
      end
    end

    # Feeds one [query, label] row to the custom learner.
    def submit_vector(arr)
      features = FeatureExtractor.new(arr[0]).feature_array
      custom_training_set.train features, arr[1]
    end
  end
end
@@ -0,0 +1,21 @@
1
module KnownItemSearchClassifier
  # Pre-computed model data handed to GaussianNaiveBayes::Classifier when no
  # custom training has been done: a probability per category plus, for each
  # category, a mean / standard deviation per feature index (0 through 4).
  class DefaultTrainingSet
    attr_reader :categories_probabilities, :categories_summaries

    def initialize
      @categories_probabilities = { :unknown => 0.78, :known => 0.22 }
      @categories_summaries = {
        :unknown => summaries_for([
          [0.2564102564102564, 0.4394771815921655],
          [0.03418803418803419, 0.11344969312798027],
          [0.002564102564102564, 0.0226455406828919],
          [0.12991452991452992, 0.26648206508636013],
          [2.7948717948717947, 2.053561836691609]
        ]),
        :known => summaries_for([
          [0.5454545454545454, 0.5096471914376255],
          [0.051659451659451655, 0.07957404805575267],
          [0.021248196248196245, 0.04412470821426937],
          [0.22550505050505054, 0.2520704609787127],
          [7.590909090909091, 5.770690236086651]
        ])
      }
    end

    private

    # Expands an ordered list of [mean, standard_deviation] pairs into a hash
    # keyed by feature index, in the shape the attr_reader above exposes.
    def summaries_for(pairs)
      table = {}
      pairs.each_with_index do |(mean, deviation), index|
        table[index] = { :mean => mean, :standard_deviation => deviation }
      end
      table
    end
  end
end
@@ -0,0 +1,68 @@
1
+ require 'engtagger'
2
+
3
module KnownItemSearchClassifier
  # Turns a raw search string into the numeric feature vector consumed by
  # the classifier. The string is run through EngTagger and the features
  # are derived from the tagged output plus the raw text.
  class FeatureExtractor
    # @param string [String] the raw search query
    def initialize(string)
      @string = string.to_s
      # to_s guards against the tagger handing back nil (e.g. for blank
      # input), which would otherwise make every scan below raise.
      @tagged = EngTagger.new.get_readable(@string).to_s
      # Every "/XX..." tag marker in the tagged text counts as one token.
      @num_words = @tagged.scan(%r{/[A-Z]{2}}).size.to_f

      @mixed_case = is_mixed_case?
      @punctuation_ratio = punctuation_ratio
      @determiner_ratio = determiner_ratio
      @proper_noun_ratio = proper_noun_ratio

      #@num_keywords = count_keywords
      #@refers_to_an_item_that_is_known = check_against_known_titles
    end

    # @return [Array<Float>] [mixed case flag, punctuation ratio,
    #   determiner ratio, proper noun ratio, number of tagged tokens]
    def feature_array
      [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words]
    end

    private

    # 1.0 when the string contains both an upper- and a lower-case ASCII
    # letter, 0.0 otherwise (a float so it can sit in the feature vector).
    def is_mixed_case?
      @string =~ /[A-Z]/ && @string =~ /[a-z]/ ? 1.0 : 0.0
    end

    # Share of tokens whose tag starts with PP.
    def punctuation_ratio
      ratio_of %r{/PP}
    end

    # Share of tokens whose tag starts with DET.
    def determiner_ratio
      ratio_of %r{/DET}
    end

    # Share of tokens whose tag starts with NNP.
    def proper_noun_ratio
      ratio_of %r{/NNP}
    end

    # Occurrences of +tag_pattern+ in the tagged text divided by the token
    # count. Returns 0.0 when there are no tokens so the feature vector
    # never contains NaN from a 0.0 / 0.0 division.
    def ratio_of(tag_pattern)
      return 0.0 if @num_words.zero?
      @tagged.scan(tag_pattern).size.to_f / @num_words
    end

    # Number of words that, stripped of punctuation and lower-cased, are
    # one of the keywords below. Currently not part of the feature vector.
    def count_keywords
      keywords_to_match = ['journal', 'course', 'textbook']
      @string.split.count do |word|
        keywords_to_match.include? word.gsub(/[[:punct:]]/, '').downcase
      end
    end

    # Whether the lower-cased query exactly matches a known title.
    # Currently not part of the feature vector.
    def check_against_known_titles
      known_titles = [
        'salt sugar fat',
      ]
      known_titles.include? @string.downcase
    end
  end
end
68
+
@@ -0,0 +1,3 @@
1
+ require 'known_item_search_classifier/default_training_set'
2
+ require 'known_item_search_classifier/feature_extractor'
3
+ require 'known_item_search_classifier/classifier'
@@ -0,0 +1,62 @@
1
+ require 'coveralls'
2
+ Coveralls.wear!
3
+ require 'minitest/autorun'
4
+ require './lib/known_item_search_classifier'
5
+
6
+
7
# Generates one test per sample query: queries in the first list must be
# classified as :known, queries in the second list as :unknown.
class KnownItemSearchClassifierTest < Minitest::Test
  classifier = KnownItemSearchClassifier::Classifier.new

  # Makes a query usable inside a method name: punctuation is dropped and
  # each whitespace character becomes an underscore.
  method_slug = lambda do |query|
    query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
  end

  known_item_training_set = [
    "little house on the",
    "the inconvenient truth",
    "the question of animal Culture by Kevin N Laland; Bennett G Galef ",
    "Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction. Viking. 2015. Print",
    "The Boy in Zaquitos",
    "The Mis-Education of the Negro",
    "human relations interpersonal job-oriented skills",
    "Research Methods for Business: A Skill-Building Approach Effectiveness of Instruction Performed through Computer-Assisted Activity Schedules on On-Schedule and Role-Play Skills of Children with Autism Spectrum Disorder",
    "competency skills for the dental assiostant",
    "Why did they kill?: Cambodia in the shadow of genocide",
    "salt sugar fat",
    "Making a Killing: Femicide, Free Trade, and La Frontera",
  ]

  unknown_item_training_set = [
    "earthworms",
    "network security",
    "work stress",
    "mummies",
    "benefits of eating healthyhy",
    "benefits of eating healthy",
    "megadosing vitamin c",
    "nutrition",
    "penquin",
    "bananas",
    "food sourcing",
    "whey protein",
    "exotic animals",
    "sweet home oregon",
    "taylor swift",
    "catholicism",
    "Professional baking ",
    "concussions after the nfl",
    "IVF the US",
    "adoption children the US",
    "Films for the hearing impaired",
    "wolves and the ecosystem",
    "dr. martin luther king",
  ]

  known_item_training_set.each do |query|
    test_name = "test_#{method_slug.call(query)}_is_classified_as_known_item"
    define_method(test_name) do
      assert_equal(:known, classifier.is_known_item_search?(query))
    end
  end

  unknown_item_training_set.each do |query|
    test_name = "test_#{method_slug.call(query)}_is_not_false_positive"
    define_method(test_name) do
      assert_equal(:unknown, classifier.is_known_item_search?(query))
    end
  end
end
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: known_item_search_classifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jane Sandberg
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-11-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: engtagger
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.2.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.2.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: gaussian_naive_bayes
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.1.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: coveralls
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '='
60
+ - !ruby/object:Gem::Version
61
+ version: 0.7.0
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '='
67
+ - !ruby/object:Gem::Version
68
+ version: 0.7.0
69
+ description: Classify search query strings
70
+ email: sandbej@linnbenton.edu
71
+ executables: []
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - lib/known_item_search_classifier.rb
76
+ - lib/known_item_search_classifier/classifier.rb
77
+ - lib/known_item_search_classifier/default_training_set.rb
78
+ - lib/known_item_search_classifier/feature_extractor.rb
79
+ - test/known_item_search_classifier_test.rb
80
+ homepage: https://github.com/sandbergja/known_item_search_classifier
81
+ licenses:
82
+ - MIT
83
+ metadata: {}
84
+ post_install_message:
85
+ rdoc_options: []
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ requirements: []
99
+ rubyforge_project:
100
+ rubygems_version: 2.5.1
101
+ signing_key:
102
+ specification_version: 4
103
+ summary: A ruby gem that classifies search query strings as either known-item searches
104
+ or unknown-item searches
105
+ test_files:
106
+ - test/known_item_search_classifier_test.rb