known_item_search_classifier 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 9723eb411c5aac044ace45f88b1529d56b5cbfb2
4
- data.tar.gz: 34757a78f9268eb5b88d1fdb0bea1a9c1569e32d
2
+ SHA256:
3
+ metadata.gz: a95567708e0b56c79c3a102e1d7c72e493e5660518de3b24c8fc42a691609938
4
+ data.tar.gz: 70ea59d9d7c0451b3d454506e578c2761c12e7d226edca852431c76bee1a9456
5
5
  SHA512:
6
- metadata.gz: 961dc70c84bfcb5c2ddfaa866c489c149d84e09ce0bdbb72dd170c7ebdbf0c800cff8dd22d45ab486a84c48a1cfdde91104f5d1cff7828875885d805db058848
7
- data.tar.gz: 7b4ca3cd5cb83a9380da7509a2758eb3a6b7e1f414ee44e2a58f809ebfab68cace4ce8d6c3c94b2c0bb6676285633d91f9f9517a489168b2847e854d3b4e2160
6
+ metadata.gz: 4fb37b0932e9e0c32f9ec0ef6bdc563bd7e4e4cca5f401186daec4ae8d3be112b96478a9f04cf715620144e5db30e340959db808d5cc99841360dd72d480984d
7
+ data.tar.gz: 96777f8fa22a9208dc4e22a76a4c74dd57785c32a52de386d0e78678880a0d0faa020e0390fe8d1275c2cc3326cdf6cfa2fae6dcee4bbe299c66c79918a696fd
@@ -3,53 +3,54 @@ require 'csv'
3
3
  require 'gaussian_naive_bayes'
4
4
 
5
5
  module KnownItemSearchClassifier
6
- class Classifier
7
- attr_accessor :custom_training_set
8
- def initialize
9
- set = DefaultTrainingSet.new
10
- @default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries, set.categories_probabilities
11
- end
12
- def is_known_item_search? query_string
13
- return classify query_string
14
- end
15
- def train training_set
16
- unless defined? @custom_training_set
17
- @custom_training_set = GaussianNaiveBayes::Learner.new
18
- end
19
- training_set.each do |query|
20
- submit_vector query
21
- end
22
- end
23
- def train_from_csv filename
24
- unless defined? @custom_training_set
25
- @custom_training_set = GaussianNaiveBayes::Learner.new
26
- end
27
- csv = ::CSV.read(filename)
28
- csv.each do |line|
29
- submit_vector line
30
- end
31
- end
6
+ class Classifier
7
+ def initialize
8
+ set = DefaultTrainingSet.new
9
+ @default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries,
10
+ set.categories_probabilities
11
+ end
12
+
13
+ def is_known_item_search?(query_string)
14
+ classify query_string
15
+ end
16
+
17
+ def train(training_set)
18
+ @custom_training_set = GaussianNaiveBayes::Learner.new unless defined? @custom_training_set
19
+ training_set.each do |query|
20
+ submit_vector query
21
+ end
22
+ end
23
+
24
+ def train_from_csv(filename)
25
+ @custom_training_set = GaussianNaiveBayes::Learner.new unless defined? @custom_training_set
26
+ csv = ::CSV.read(filename)
27
+ csv.each do |line|
28
+ submit_vector line
29
+ end
30
+ end
32
31
 
33
32
  private
34
- def classify string
35
- f = FeatureExtractor.new string
36
- feature_array = f.feature_array
37
- if defined? @custom_training_set
38
- classifier = @custom_training_set.classifier
39
- query_class = classifier.classify(feature_array)
40
- else
41
- query_class = @default_training_set.classify(feature_array)
42
- end
43
- return query_class
44
- if :known == query_class
45
- return true
46
- else
47
- return false
48
- end
49
- end
50
- def submit_vector arr
51
- f = FeatureExtractor.new arr[0]
52
- @custom_training_set.train f.feature_array, arr[1]
53
- end
33
+
34
+ attr_reader :custom_tr
35
+
36
+ def classify(string)
37
+ f = FeatureExtractor.new string
38
+ feature_array = f.feature_array
39
+ if defined? @custom_training_set
40
+ classifier = @custom_training_set.classifier
41
+ query_class = classifier.classify(feature_array)
42
+ else
43
+ query_class = @default_training_set.classify(feature_array)
44
+ end
45
+ return query_class
46
+ return true if :known == query_class
47
+
48
+ false
49
+ end
50
+
51
+ def submit_vector(arr)
52
+ f = FeatureExtractor.new arr[0]
53
+ @custom_training_set.train f.feature_array, arr[1]
54
54
  end
55
+ end
55
56
  end
@@ -1,23 +1,24 @@
1
1
  module KnownItemSearchClassifier
2
- class DefaultTrainingSet
3
- attr_reader :categories_probabilities, :categories_summaries
4
- def initialize
5
- @categories_probabilities={:unknown=>0.835, :known=>0.165}
6
- @categories_summaries= {
7
- :unknown=>{
8
- 0=>{:mean=>0.32335329341317365, :standard_deviation=>0.4691630728112455},
9
- 1=>{:mean=>0.01867693185058454, :standard_deviation=>0.0856521002382124},
10
- 2=>{:mean=>0.0024950099800399197, :standard_deviation=>0.02318575984424029},
11
- 3=>{:mean=>0.18252067293983462, :standard_deviation=>0.32649287803592736},
12
- 4=>{:mean=>2.2634730538922154, :standard_deviation=>1.3497147972472143},
13
- 5=>{:mean=>0.20958083832335328, :standard_deviation=>1.2933208182456999}},
14
- :known=>{
15
- 0=>{:mean=>0.3333333333333333, :standard_deviation=>0.478713553878169},
16
- 1=>{:mean=>0.034283854046699896, :standard_deviation=>0.07844034834013752},
17
- 2=>{:mean=>0.06397250092902267, :standard_deviation=>0.10673099909054994},
18
- 3=>{:mean=>0.06715805055726004, :standard_deviation=>0.1488979015655406},
19
- 4=>{:mean=>4.696969696969697, :standard_deviation=>4.9591131294116515},
20
- 5=>{:mean=>3.9393939393939394, :standard_deviation=>5.606577576491037}}}
21
- end
2
+ class DefaultTrainingSet
3
+ attr_reader :categories_probabilities, :categories_summaries
4
+
5
+ def initialize
6
+ @categories_probabilities = { 'known' => 0.3333333333333333, 'unknown' => 0.6666666666666666 }
7
+ @categories_summaries =
8
+ { 'known' =>
9
+ { 0 => { mean: 0.6, standard_deviation: 0.5 },
10
+ 1 => { mean: 0.0516060606060606, standard_deviation: 0.09910312916958242 },
11
+ 2 => { mean: 0.06633333333333333, standard_deviation: 0.13412266359153804 },
12
+ 3 => { mean: 0.2575454545454545, standard_deviation: 0.27976953051588926 },
13
+ 4 => { mean: 4.76, standard_deviation: 3.8867295592395754 },
14
+ 5 => { mean: 3.48, standard_deviation: 4.91697739131132 } },
15
+ 'unknown' =>
16
+ { 0 => { mean: 0.18, standard_deviation: 0.38808793449160356 },
17
+ 1 => { mean: 0.03966666666666667, standard_deviation: 0.1241245990920947 },
18
+ 2 => { mean: 0.009000000000000001, standard_deviation: 0.04482391854210637 },
19
+ 3 => { mean: 0.11, standard_deviation: 0.25134558515041244 },
20
+ 4 => { mean: 2.44, standard_deviation: 1.0720950308167836 },
21
+ 5 => { mean: 0.14, standard_deviation: 0.7001457574195914 } } }
22
22
  end
23
+ end
23
24
  end
@@ -1,73 +1,75 @@
1
1
  require 'engtagger'
2
2
 
3
3
  module KnownItemSearchClassifier
4
- class FeatureExtractor
5
- def initialize string
6
- @string = string
7
- tagger = EngTagger.new
8
- @tagged = tagger.get_readable string
9
- @num_words = @tagged.scan(/\/[A-Z]{2}/).size.to_f
10
-
11
- @mixed_case = is_mixed_case?
12
- @punctuation_ratio = punctuation_ratio
13
- @determiner_ratio = determiner_ratio
14
- @proper_noun_ratio = proper_noun_ratio
15
- @numeric_count = numeric_count
16
-
17
- #@num_keywords = count_keywords
18
- #@refers_to_an_item_that_is_known = check_against_known_titles
19
-
20
- end
21
- def feature_array
22
- return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
23
- end
24
- private
25
- def is_mixed_case?
26
- if @string =~ /[A-Z]/ and @string =~ /[a-z]/
27
- return 1.0
28
- end
29
- return 0.0
30
- end
31
- def punctuation_ratio
32
- num_punct = @tagged.scan(/\/PP/).size.to_f
33
- return num_punct / @num_words
34
- end
35
- def determiner_ratio
36
- num_det = @tagged.scan(/\/DET/).size.to_f
37
- return num_det / @num_words
38
- end
39
- def numeric_count
40
- return @string.scan(/[0-9]/).length
41
- end
42
- def proper_noun_ratio
43
- num_prop_noun = @tagged.scan(/\/NNP/).size.to_f
44
- return num_prop_noun / @num_words
45
- end
46
- def count_keywords
47
- end
48
- def check_against_known_titles
49
- end
50
- def count_keywords
51
- keywords_to_match = ['journal', 'course', 'textbook']
52
- num_keywords = 0
53
- @query_string.split.each do |word|
54
- if keywords_to_match.include? word.gsub(/[[:punct:]]/, '').downcase
55
- num_keywords = num_keywords + 1
56
- end
57
- end
58
- return num_keywords
59
- end
60
- def check_against_known_titles
61
- known_titles = [
62
- 'fountainhead',
63
- 'salt sugar fat',
64
- ]
65
- if known_titles.include? @query_string.downcase
66
- return true
67
- else
68
- return false
69
- end
70
- end
4
+ class FeatureExtractor
5
+ def initialize(string)
6
+ @string = string
7
+ tagger = EngTagger.new
8
+ @tagged = tagger.get_readable string
9
+ @num_words = @tagged.scan(%r{/[A-Z]{2}}).size.to_f
10
+
11
+ @mixed_case = is_mixed_case?
12
+ @punctuation_ratio = punctuation_ratio
13
+ @determiner_ratio = determiner_ratio
14
+ @proper_noun_ratio = proper_noun_ratio
15
+ @numeric_count = numeric_count
16
+
17
+ # @num_keywords = count_keywords
18
+ # @refers_to_an_item_that_is_known = check_against_known_titles
71
19
  end
72
- end
73
20
 
21
+ def feature_array
22
+ [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
23
+ end
24
+
25
+ private
26
+
27
+ def is_mixed_case?
28
+ return 1.0 if @string =~ /[A-Z]/ and @string =~ /[a-z]/
29
+
30
+ 0.0
31
+ end
32
+
33
+ def punctuation_ratio
34
+ num_punct = @tagged.scan(%r{/PP}).size.to_f
35
+ num_punct / @num_words
36
+ end
37
+
38
+ def determiner_ratio
39
+ num_det = @tagged.scan(%r{/DET}).size.to_f
40
+ num_det / @num_words
41
+ end
42
+
43
+ def numeric_count
44
+ @string.scan(/[0-9]/).length
45
+ end
46
+
47
+ def proper_noun_ratio
48
+ num_prop_noun = @tagged.scan(%r{/NNP}).size.to_f
49
+ num_prop_noun / @num_words
50
+ end
51
+
52
+ def count_keywords; end
53
+
54
+ def check_against_known_titles; end
55
+
56
+ def count_keywords
57
+ keywords_to_match = %w[journal course textbook]
58
+ num_keywords = 0
59
+ @query_string.split.each do |word|
60
+ num_keywords += 1 if keywords_to_match.include? word.gsub(/[[:punct:]]/, '').downcase
61
+ end
62
+ num_keywords
63
+ end
64
+
65
+ def check_against_known_titles
66
+ known_titles = [
67
+ 'fountainhead',
68
+ 'salt sugar fat'
69
+ ]
70
+ return true if known_titles.include? @query_string.downcase
71
+
72
+ false
73
+ end
74
+ end
75
+ end
@@ -1,62 +1,49 @@
1
- require 'coveralls'
2
- Coveralls.wear!
3
1
  require 'minitest/autorun'
4
2
  require './lib/known_item_search_classifier'
5
3
 
6
-
7
4
  class KnownItemSearchClassifierTest < Minitest::Test
8
- classifier = KnownItemSearchClassifier::Classifier.new
5
+ classifier = KnownItemSearchClassifier::Classifier.new
9
6
 
10
- known_item_training_set = [
11
- "little house on the",
12
- "the inconvenient truth",
13
- "the question of animal Culture by Kevin N Laland; Bennett G Galef ",
14
- "Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction. Viking. 2015. Print",
15
- "The Boy in Zaquitos",
16
- "The Mis-Education of the Negro",
17
- "human relations interpersonal job-oriented skills",
18
- "Research Methods for Business: A Skill-Building Approach Effectiveness of Instruction Performed through Computer-Assisted Activity Schedules on On-Schedule and Role-Play Skills of Children with Autism Spectrum Disorder",
19
- "competency skills for the dental assiostant",
20
- "Why did they kill?: Cambodia in the shadow of genocide",
21
- "salt sugar fat",
22
- "Making a Killing: Femicide, Free Trade, and La Frontera",
23
- ]
24
- known_item_training_set.each do |query|
25
- cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
26
- define_method("test_#{cleaned_up_query}_is_classified_as_known_item") do
27
- assert_equal(:known, classifier.is_known_item_search?(query))
28
- end
7
+ known_item_training_set = [
8
+ # 'hobbit first edition', -- classifier incorrectly classifies this as unknown
9
+ # 'my soul is rested', -- classifier incorrectly classifies this as unknown
10
+ # 'new yorker', -- classifier incorrectly classifies this as unknown
11
+ # 'when harry met sally', -- classifier incorrectly classifies this as unknown
12
+ # '"neo tekunoroji"', -- classifier incorrectly classifies this as unknown
13
+ '99131236427206421',
14
+ 'A decision making model for selecting start-up businesses in a government venture capital scheme',
15
+ # 'Dostoevsky Brothers Karamazov', -- classifier incorrectly classifies this as unknown
16
+ # 'Lawrence Classic American Literature', -- classifier incorrectly classifies this as unknown
17
+ # 'salt sugar fat', -- classifier incorrectly classifies this as unknown
18
+ 'Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction. Viking. 2015. Print',
19
+ 'the inconvenient truth',
20
+ 'Polarization: What Everyone Needs to Know',
21
+ 'little house on the'
22
+ ]
23
+ known_item_training_set.each do |query|
24
+ cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
25
+ define_method("test_#{cleaned_up_query}_is_classified_as_known_item") do
26
+ assert_equal(:known, classifier.is_known_item_search?(query).to_sym)
29
27
  end
28
+ end
30
29
 
31
- unknown_item_training_set = [
32
- "earthworms",
33
- "network security",
34
- "work stress",
35
- "mummies",
36
- "benefits of eating healthyhy",
37
- "benefits of eating healthy",
38
- "megadosing vitamin c",
39
- "nutrition",
40
- "penquin",
41
- "bananas",
42
- "food sourcing",
43
- "whey protein",
44
- "exotic animals",
45
- "sweet home oregon",
46
- "taylor swift",
47
- "catholicism",
48
- "Professional baking ",
49
- "concussions after the nfl",
50
- "IVF the US",
51
- "adoption children the US",
52
- "Films for the hearing impaired",
53
- "wolves and the ecosystem",
54
- "dr. martin luther king",
55
- ]
56
- unknown_item_training_set.each do |query|
57
- cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
58
- define_method("test_#{cleaned_up_query}_is_not_false_positive") do
59
- assert_equal(:unknown, classifier.is_known_item_search?(query))
60
- end
30
+ unknown_item_training_set = [
31
+ 'colonial mexico textiles',
32
+ 'history of horses',
33
+ 'medical expertise COVID',
34
+ 'music and sexuality',
35
+ 'paper industry',
36
+ 'sun ra',
37
+ # 'concussions after the nfl', -- classifier incorrectly classifies this as known
38
+ 'Professional baking ',
39
+ 'Manos chatzidakis',
40
+ 'whey protein',
41
+ 'benefits of eating healthyhy'
42
+ ]
43
+ unknown_item_training_set.each do |query|
44
+ cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
45
+ define_method("test_#{cleaned_up_query}_is_not_false_positive") do
46
+ assert_equal(:unknown, classifier.is_known_item_search?(query).to_sym)
61
47
  end
48
+ end
62
49
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: known_item_search_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jane Sandberg
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-03-25 00:00:00.000000000 Z
11
+ date: 2024-11-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: engtagger
@@ -53,21 +53,35 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: coveralls
56
+ name: rake
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '='
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rubocop
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
60
74
  - !ruby/object:Gem::Version
61
- version: 0.7.0
75
+ version: '0'
62
76
  type: :development
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
- - - '='
80
+ - - ">="
67
81
  - !ruby/object:Gem::Version
68
- version: 0.7.0
82
+ version: '0'
69
83
  description: Classify search query strings
70
- email: sandbej@linnbenton.edu
84
+ email:
71
85
  executables: []
72
86
  extensions: []
73
87
  extra_rdoc_files: []
@@ -81,7 +95,7 @@ homepage: https://github.com/sandbergja/known_item_search_classifier
81
95
  licenses:
82
96
  - MIT
83
97
  metadata: {}
84
- post_install_message:
98
+ post_install_message:
85
99
  rdoc_options: []
86
100
  require_paths:
87
101
  - lib
@@ -89,16 +103,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
89
103
  requirements:
90
104
  - - ">="
91
105
  - !ruby/object:Gem::Version
92
- version: '0'
106
+ version: 3.0.0
93
107
  required_rubygems_version: !ruby/object:Gem::Requirement
94
108
  requirements:
95
109
  - - ">="
96
110
  - !ruby/object:Gem::Version
97
111
  version: '0'
98
112
  requirements: []
99
- rubyforge_project:
100
- rubygems_version: 2.5.1
101
- signing_key:
113
+ rubygems_version: 3.5.16
114
+ signing_key:
102
115
  specification_version: 4
103
116
  summary: A ruby gem that classifies search query strings as either known-item searches
104
117
  or unknown-item searches