known_item_search_classifier 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 89a7295765e4fe6c6503152b733a79b46151e8bd
4
- data.tar.gz: 4b541eaf18b0c8a8d74ad8f4525ad28888dcf652
2
+ SHA256:
3
+ metadata.gz: a95567708e0b56c79c3a102e1d7c72e493e5660518de3b24c8fc42a691609938
4
+ data.tar.gz: 70ea59d9d7c0451b3d454506e578c2761c12e7d226edca852431c76bee1a9456
5
5
  SHA512:
6
- metadata.gz: 4166fa3994f73f19828b9c7c53ee6d8678cdd284cb39189b42abd03bfd69b0b2cd4849105406d7d0c5d49f85e2a9e631808ea3bee23a6374d15bb63735324740
7
- data.tar.gz: 01d20e0df51bc086e5aeb6b9348cf2543bc1dbb892dbf246d13158e72a36024b6344c7ce52a4bee60a6d3537e262b6205ef2b8168deb95d0815dc4997c6be179
6
+ metadata.gz: 4fb37b0932e9e0c32f9ec0ef6bdc563bd7e4e4cca5f401186daec4ae8d3be112b96478a9f04cf715620144e5db30e340959db808d5cc99841360dd72d480984d
7
+ data.tar.gz: 96777f8fa22a9208dc4e22a76a4c74dd57785c32a52de386d0e78678880a0d0faa020e0390fe8d1275c2cc3326cdf6cfa2fae6dcee4bbe299c66c79918a696fd
@@ -3,52 +3,54 @@ require 'csv'
3
3
  require 'gaussian_naive_bayes'
4
4
 
5
5
  module KnownItemSearchClassifier
6
- class Classifier
7
- def initialize
8
- set = DefaultTrainingSet.new
9
- @default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries, set.categories_probabilities
10
- end
11
- def is_known_item_search? query_string
12
- return classify query_string
13
- end
14
- def train training_set
15
- unless defined? @custom_training_set
16
- @custom_training_set = GaussianNaiveBayes::Learner.new
17
- end
18
- training_set.each do |query|
19
- submit_vector query
20
- end
21
- end
22
- def train_from_csv filename
23
- unless defined? @custom_training_set
24
- @custom_training_set = GaussianNaiveBayes::Learner.new
25
- end
26
- csv = ::CSV.read(filename)
27
- csv.each do |line|
28
- submit_vector line
29
- end
30
- end
6
+ class Classifier
7
+ def initialize
8
+ set = DefaultTrainingSet.new
9
+ @default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries,
10
+ set.categories_probabilities
11
+ end
12
+
13
+ def is_known_item_search?(query_string)
14
+ classify query_string
15
+ end
16
+
17
+ def train(training_set)
18
+ @custom_training_set = GaussianNaiveBayes::Learner.new unless defined? @custom_training_set
19
+ training_set.each do |query|
20
+ submit_vector query
21
+ end
22
+ end
23
+
24
+ def train_from_csv(filename)
25
+ @custom_training_set = GaussianNaiveBayes::Learner.new unless defined? @custom_training_set
26
+ csv = ::CSV.read(filename)
27
+ csv.each do |line|
28
+ submit_vector line
29
+ end
30
+ end
31
31
 
32
32
  private
33
- def classify string
34
- f = FeatureExtractor.new string
35
- feature_array = f.feature_array
36
- if defined? @custom_training_set
37
- classifier = @custom_training_set.classifier
38
- query_class = classifier.classify(feature_array)
39
- else
40
- query_class = @default_training_set.classify(feature_array)
41
- end
42
- return query_class
43
- if :known == query_class
44
- return true
45
- else
46
- return false
47
- end
48
- end
49
- def submit_vector arr
50
- f = FeatureExtractor.new arr[0]
51
- @custom_training_set.train f.feature_array, arr[1]
52
- end
33
+
34
+ attr_reader :custom_tr
35
+
36
+ def classify(string)
37
+ f = FeatureExtractor.new string
38
+ feature_array = f.feature_array
39
+ if defined? @custom_training_set
40
+ classifier = @custom_training_set.classifier
41
+ query_class = classifier.classify(feature_array)
42
+ else
43
+ query_class = @default_training_set.classify(feature_array)
44
+ end
45
+ return query_class
46
+ return true if :known == query_class
47
+
48
+ false
49
+ end
50
+
51
+ def submit_vector(arr)
52
+ f = FeatureExtractor.new arr[0]
53
+ @custom_training_set.train f.feature_array, arr[1]
53
54
  end
55
+ end
54
56
  end
@@ -1,23 +1,24 @@
1
1
  module KnownItemSearchClassifier
2
- class DefaultTrainingSet
3
- attr_reader :categories_probabilities, :categories_summaries
4
- def initialize
5
- @categories_probabilities={:unknown=>0.835, :known=>0.165}
6
- @categories_summaries= {
7
- :unknown=>{
8
- 0=>{:mean=>0.32335329341317365, :standard_deviation=>0.4691630728112455},
9
- 1=>{:mean=>0.01867693185058454, :standard_deviation=>0.0856521002382124},
10
- 2=>{:mean=>0.0024950099800399197, :standard_deviation=>0.02318575984424029},
11
- 3=>{:mean=>0.18252067293983462, :standard_deviation=>0.32649287803592736},
12
- 4=>{:mean=>2.2634730538922154, :standard_deviation=>1.3497147972472143},
13
- 5=>{:mean=>0.20958083832335328, :standard_deviation=>1.2933208182456999}},
14
- :known=>{
15
- 0=>{:mean=>0.3333333333333333, :standard_deviation=>0.478713553878169},
16
- 1=>{:mean=>0.034283854046699896, :standard_deviation=>0.07844034834013752},
17
- 2=>{:mean=>0.06397250092902267, :standard_deviation=>0.10673099909054994},
18
- 3=>{:mean=>0.06715805055726004, :standard_deviation=>0.1488979015655406},
19
- 4=>{:mean=>4.696969696969697, :standard_deviation=>4.9591131294116515},
20
- 5=>{:mean=>3.9393939393939394, :standard_deviation=>5.606577576491037}}}
21
- end
2
+ class DefaultTrainingSet
3
+ attr_reader :categories_probabilities, :categories_summaries
4
+
5
+ def initialize
6
+ @categories_probabilities = { 'known' => 0.3333333333333333, 'unknown' => 0.6666666666666666 }
7
+ @categories_summaries =
8
+ { 'known' =>
9
+ { 0 => { mean: 0.6, standard_deviation: 0.5 },
10
+ 1 => { mean: 0.0516060606060606, standard_deviation: 0.09910312916958242 },
11
+ 2 => { mean: 0.06633333333333333, standard_deviation: 0.13412266359153804 },
12
+ 3 => { mean: 0.2575454545454545, standard_deviation: 0.27976953051588926 },
13
+ 4 => { mean: 4.76, standard_deviation: 3.8867295592395754 },
14
+ 5 => { mean: 3.48, standard_deviation: 4.91697739131132 } },
15
+ 'unknown' =>
16
+ { 0 => { mean: 0.18, standard_deviation: 0.38808793449160356 },
17
+ 1 => { mean: 0.03966666666666667, standard_deviation: 0.1241245990920947 },
18
+ 2 => { mean: 0.009000000000000001, standard_deviation: 0.04482391854210637 },
19
+ 3 => { mean: 0.11, standard_deviation: 0.25134558515041244 },
20
+ 4 => { mean: 2.44, standard_deviation: 1.0720950308167836 },
21
+ 5 => { mean: 0.14, standard_deviation: 0.7001457574195914 } } }
22
22
  end
23
+ end
23
24
  end
@@ -1,73 +1,75 @@
1
1
  require 'engtagger'
2
2
 
3
3
  module KnownItemSearchClassifier
4
- class FeatureExtractor
5
- def initialize string
6
- @string = string
7
- tagger = EngTagger.new
8
- @tagged = tagger.get_readable string
9
- @num_words = @tagged.scan(/\/[A-Z]{2}/).size.to_f
10
-
11
- @mixed_case = is_mixed_case?
12
- @punctuation_ratio = punctuation_ratio
13
- @determiner_ratio = determiner_ratio
14
- @proper_noun_ratio = proper_noun_ratio
15
- @numeric_count = numeric_count
16
-
17
- #@num_keywords = count_keywords
18
- #@refers_to_an_item_that_is_known = check_against_known_titles
19
-
20
- end
21
- def feature_array
22
- return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
23
- end
24
- private
25
- def is_mixed_case?
26
- if @string =~ /[A-Z]/ and @string =~ /[a-z]/
27
- return 1.0
28
- end
29
- return 0.0
30
- end
31
- def punctuation_ratio
32
- num_punct = @tagged.scan(/\/PP/).size.to_f
33
- return num_punct / @num_words
34
- end
35
- def determiner_ratio
36
- num_det = @tagged.scan(/\/DET/).size.to_f
37
- return num_det / @num_words
38
- end
39
- def numeric_count
40
- return @string.scan(/[0-9]/).length
41
- end
42
- def proper_noun_ratio
43
- num_prop_noun = @tagged.scan(/\/NNP/).size.to_f
44
- return num_prop_noun / @num_words
45
- end
46
- def count_keywords
47
- end
48
- def check_against_known_titles
49
- end
50
- def count_keywords
51
- keywords_to_match = ['journal', 'course', 'textbook']
52
- num_keywords = 0
53
- @query_string.split.each do |word|
54
- if keywords_to_match.include? word.gsub(/[[:punct:]]/, '').downcase
55
- num_keywords = num_keywords + 1
56
- end
57
- end
58
- return num_keywords
59
- end
60
- def check_against_known_titles
61
- known_titles = [
62
- 'fountainhead',
63
- 'salt sugar fat',
64
- ]
65
- if known_titles.include? @query_string.downcase
66
- return true
67
- else
68
- return false
69
- end
70
- end
4
+ class FeatureExtractor
5
+ def initialize(string)
6
+ @string = string
7
+ tagger = EngTagger.new
8
+ @tagged = tagger.get_readable string
9
+ @num_words = @tagged.scan(%r{/[A-Z]{2}}).size.to_f
10
+
11
+ @mixed_case = is_mixed_case?
12
+ @punctuation_ratio = punctuation_ratio
13
+ @determiner_ratio = determiner_ratio
14
+ @proper_noun_ratio = proper_noun_ratio
15
+ @numeric_count = numeric_count
16
+
17
+ # @num_keywords = count_keywords
18
+ # @refers_to_an_item_that_is_known = check_against_known_titles
71
19
  end
72
- end
73
20
 
21
+ def feature_array
22
+ [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
23
+ end
24
+
25
+ private
26
+
27
+ def is_mixed_case?
28
+ return 1.0 if @string =~ /[A-Z]/ and @string =~ /[a-z]/
29
+
30
+ 0.0
31
+ end
32
+
33
+ def punctuation_ratio
34
+ num_punct = @tagged.scan(%r{/PP}).size.to_f
35
+ num_punct / @num_words
36
+ end
37
+
38
+ def determiner_ratio
39
+ num_det = @tagged.scan(%r{/DET}).size.to_f
40
+ num_det / @num_words
41
+ end
42
+
43
+ def numeric_count
44
+ @string.scan(/[0-9]/).length
45
+ end
46
+
47
+ def proper_noun_ratio
48
+ num_prop_noun = @tagged.scan(%r{/NNP}).size.to_f
49
+ num_prop_noun / @num_words
50
+ end
51
+
52
+ def count_keywords; end
53
+
54
+ def check_against_known_titles; end
55
+
56
+ def count_keywords
57
+ keywords_to_match = %w[journal course textbook]
58
+ num_keywords = 0
59
+ @query_string.split.each do |word|
60
+ num_keywords += 1 if keywords_to_match.include? word.gsub(/[[:punct:]]/, '').downcase
61
+ end
62
+ num_keywords
63
+ end
64
+
65
+ def check_against_known_titles
66
+ known_titles = [
67
+ 'fountainhead',
68
+ 'salt sugar fat'
69
+ ]
70
+ return true if known_titles.include? @query_string.downcase
71
+
72
+ false
73
+ end
74
+ end
75
+ end
@@ -1,62 +1,49 @@
1
- require 'coveralls'
2
- Coveralls.wear!
3
1
  require 'minitest/autorun'
4
2
  require './lib/known_item_search_classifier'
5
3
 
6
-
7
4
  class KnownItemSearchClassifierTest < Minitest::Test
8
- classifier = KnownItemSearchClassifier::Classifier.new
5
+ classifier = KnownItemSearchClassifier::Classifier.new
9
6
 
10
- known_item_training_set = [
11
- "little house on the",
12
- "the inconvenient truth",
13
- "the question of animal Culture by Kevin N Laland; Bennett G Galef ",
14
- "Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction. Viking. 2015. Print",
15
- "The Boy in Zaquitos",
16
- "The Mis-Education of the Negro",
17
- "human relations interpersonal job-oriented skills",
18
- "Research Methods for Business: A Skill-Building Approach Effectiveness of Instruction Performed through Computer-Assisted Activity Schedules on On-Schedule and Role-Play Skills of Children with Autism Spectrum Disorder",
19
- "competency skills for the dental assiostant",
20
- "Why did they kill?: Cambodia in the shadow of genocide",
21
- "salt sugar fat",
22
- "Making a Killing: Femicide, Free Trade, and La Frontera",
23
- ]
24
- known_item_training_set.each do |query|
25
- cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
26
- define_method("test_#{cleaned_up_query}_is_classified_as_known_item") do
27
- assert_equal(:known, classifier.is_known_item_search?(query))
28
- end
7
+ known_item_training_set = [
8
+ # 'hobbit first edition', -- classifier incorrectly classifies this as unknown
9
+ # 'my soul is rested', -- classifier incorrectly classifies this as unknown
10
+ # 'new yorker', -- classifier incorrectly classifies this as unknown
11
+ # 'when harry met sally', -- classifier incorrectly classifies this as unknown
12
+ # '"neo tekunoroji"', -- classifier incorrectly classifies this as unknown
13
+ '99131236427206421',
14
+ 'A decision making model for selecting start-up businesses in a government venture capital scheme',
15
+ # 'Dostoevsky Brothers Karamazov', -- classifier incorrectly classifies this as unknown
16
+ # 'Lawrence Classic American Literature', -- classifier incorrectly classifies this as unknown
17
+ # 'salt sugar fat', -- classifier incorrectly classifies this as unknown
18
+ 'Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction. Viking. 2015. Print',
19
+ 'the inconvenient truth',
20
+ 'Polarization: What Everyone Needs to Know',
21
+ 'little house on the'
22
+ ]
23
+ known_item_training_set.each do |query|
24
+ cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
25
+ define_method("test_#{cleaned_up_query}_is_classified_as_known_item") do
26
+ assert_equal(:known, classifier.is_known_item_search?(query).to_sym)
29
27
  end
28
+ end
30
29
 
31
- unknown_item_training_set = [
32
- "earthworms",
33
- "network security",
34
- "work stress",
35
- "mummies",
36
- "benefits of eating healthyhy",
37
- "benefits of eating healthy",
38
- "megadosing vitamin c",
39
- "nutrition",
40
- "penquin",
41
- "bananas",
42
- "food sourcing",
43
- "whey protein",
44
- "exotic animals",
45
- "sweet home oregon",
46
- "taylor swift",
47
- "catholicism",
48
- "Professional baking ",
49
- "concussions after the nfl",
50
- "IVF the US",
51
- "adoption children the US",
52
- "Films for the hearing impaired",
53
- "wolves and the ecosystem",
54
- "dr. martin luther king",
55
- ]
56
- unknown_item_training_set.each do |query|
57
- cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
58
- define_method("test_#{cleaned_up_query}_is_not_false_positive") do
59
- assert_equal(:unknown, classifier.is_known_item_search?(query))
60
- end
30
+ unknown_item_training_set = [
31
+ 'colonial mexico textiles',
32
+ 'history of horses',
33
+ 'medical expertise COVID',
34
+ 'music and sexuality',
35
+ 'paper industry',
36
+ 'sun ra',
37
+ # 'concussions after the nfl', -- classifier incorrectly classifies this as known
38
+ 'Professional baking ',
39
+ 'Manos chatzidakis',
40
+ 'whey protein',
41
+ 'benefits of eating healthyhy'
42
+ ]
43
+ unknown_item_training_set.each do |query|
44
+ cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
45
+ define_method("test_#{cleaned_up_query}_is_not_false_positive") do
46
+ assert_equal(:unknown, classifier.is_known_item_search?(query).to_sym)
61
47
  end
48
+ end
62
49
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: known_item_search_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jane Sandberg
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-03-25 00:00:00.000000000 Z
11
+ date: 2024-11-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: engtagger
@@ -53,21 +53,35 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: coveralls
56
+ name: rake
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '='
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rubocop
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
60
74
  - !ruby/object:Gem::Version
61
- version: 0.7.0
75
+ version: '0'
62
76
  type: :development
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
- - - '='
80
+ - - ">="
67
81
  - !ruby/object:Gem::Version
68
- version: 0.7.0
82
+ version: '0'
69
83
  description: Classify search query strings
70
- email: sandbej@linnbenton.edu
84
+ email:
71
85
  executables: []
72
86
  extensions: []
73
87
  extra_rdoc_files: []
@@ -81,7 +95,7 @@ homepage: https://github.com/sandbergja/known_item_search_classifier
81
95
  licenses:
82
96
  - MIT
83
97
  metadata: {}
84
- post_install_message:
98
+ post_install_message:
85
99
  rdoc_options: []
86
100
  require_paths:
87
101
  - lib
@@ -89,16 +103,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
89
103
  requirements:
90
104
  - - ">="
91
105
  - !ruby/object:Gem::Version
92
- version: '0'
106
+ version: 3.0.0
93
107
  required_rubygems_version: !ruby/object:Gem::Requirement
94
108
  requirements:
95
109
  - - ">="
96
110
  - !ruby/object:Gem::Version
97
111
  version: '0'
98
112
  requirements: []
99
- rubyforge_project:
100
- rubygems_version: 2.5.1
101
- signing_key:
113
+ rubygems_version: 3.5.16
114
+ signing_key:
102
115
  specification_version: 4
103
116
  summary: A ruby gem that classifies search query strings as either known-item searches
104
117
  or unknown-item searches