known_item_search_classifier 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a95567708e0b56c79c3a102e1d7c72e493e5660518de3b24c8fc42a691609938
4
- data.tar.gz: 70ea59d9d7c0451b3d454506e578c2761c12e7d226edca852431c76bee1a9456
3
+ metadata.gz: 026e2cc9387f7e1bbfd52bcf37d24a32e5f44665f4217e8e87276fff08cd9951
4
+ data.tar.gz: 2c4d42876c29118a017397d1cb9a55a6e79323cbca71929b6f1ff72b414fd353
5
5
  SHA512:
6
- metadata.gz: 4fb37b0932e9e0c32f9ec0ef6bdc563bd7e4e4cca5f401186daec4ae8d3be112b96478a9f04cf715620144e5db30e340959db808d5cc99841360dd72d480984d
7
- data.tar.gz: 96777f8fa22a9208dc4e22a76a4c74dd57785c32a52de386d0e78678880a0d0faa020e0390fe8d1275c2cc3326cdf6cfa2fae6dcee4bbe299c66c79918a696fd
6
+ metadata.gz: '0295cfeddf0ed53d31a39053b5cbded6ef311eaf319a056564889953a592aa89e379a10274b7972f9109b8066fd594932b4511d2cbf4da72b06df65643cfba2b'
7
+ data.tar.gz: ec9c1cfb9ad88193d7c8b0d52d31b0b9744c4d2862f4fc868240678219632cb6a364c4ff14180de66d5aa95e5a4f293de85bea45fc7d132069b41d9ec9c1e818
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Classifies search strings as either known-item searches or unknown-item searches
2
4
  require 'csv'
3
5
  require 'gaussian_naive_bayes'
@@ -29,9 +31,9 @@ module KnownItemSearchClassifier
29
31
  end
30
32
  end
31
33
 
32
- private
34
+ attr_reader :custom_training_set
33
35
 
34
- attr_reader :custom_tr
36
+ private
35
37
 
36
38
  def classify(string)
37
39
  f = FeatureExtractor.new string
@@ -43,7 +45,7 @@ module KnownItemSearchClassifier
43
45
  query_class = @default_training_set.classify(feature_array)
44
46
  end
45
47
  return query_class
46
- return true if :known == query_class
48
+ return true if query_class == :known
47
49
 
48
50
  false
49
51
  end
@@ -1,24 +1,26 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module KnownItemSearchClassifier
2
4
  class DefaultTrainingSet
3
- attr_reader :categories_probabilities, :categories_summaries
5
+ def categories_probabilities
6
+ { 'unknown' => 0.552, 'known' => 0.448 }
7
+ end
4
8
 
5
- def initialize
6
- @categories_probabilities = { 'known' => 0.3333333333333333, 'unknown' => 0.6666666666666666 }
7
- @categories_summaries =
8
- { 'known' =>
9
- { 0 => { mean: 0.6, standard_deviation: 0.5 },
10
- 1 => { mean: 0.0516060606060606, standard_deviation: 0.09910312916958242 },
11
- 2 => { mean: 0.06633333333333333, standard_deviation: 0.13412266359153804 },
12
- 3 => { mean: 0.2575454545454545, standard_deviation: 0.27976953051588926 },
13
- 4 => { mean: 4.76, standard_deviation: 3.8867295592395754 },
14
- 5 => { mean: 3.48, standard_deviation: 4.91697739131132 } },
15
- 'unknown' =>
16
- { 0 => { mean: 0.18, standard_deviation: 0.38808793449160356 },
17
- 1 => { mean: 0.03966666666666667, standard_deviation: 0.1241245990920947 },
18
- 2 => { mean: 0.009000000000000001, standard_deviation: 0.04482391854210637 },
19
- 3 => { mean: 0.11, standard_deviation: 0.25134558515041244 },
20
- 4 => { mean: 2.44, standard_deviation: 1.0720950308167836 },
21
- 5 => { mean: 0.14, standard_deviation: 0.7001457574195914 } } }
9
+ def categories_summaries
10
+ { 'unknown' =>
11
+ { 0 => { mean: 0.34782608695652173, standard_deviation: 0.4774351058385226 },
12
+ 1 => { mean: 0.024483651893859832, standard_deviation: 0.09877772267472562 },
13
+ 2 => { mean: 0.004589371980676328, standard_deviation: 0.032912070012630565 },
14
+ 3 => { mean: 0.23732759224252611, standard_deviation: 0.3775861432826544 },
15
+ 4 => { mean: 2.5072463768115942, standard_deviation: 1.7511981064219535 },
16
+ 5 => { mean: 0.05314009661835749, standard_deviation: 0.44293515127930905 } },
17
+ 'known' =>
18
+ { 0 => { mean: 0.6011904761904762, standard_deviation: 0.49111727628992086 },
19
+ 1 => { mean: 0.05071208898451003, standard_deviation: 0.10551578513388955 },
20
+ 2 => { mean: 0.043913293286663324, standard_deviation: 0.089967379556707 },
21
+ 3 => { mean: 0.24845155590636964, standard_deviation: 0.3038306400149365 },
22
+ 4 => { mean: 5.071428571428571, standard_deviation: 4.292552286529769 },
23
+ 5 => { mean: 2.8214285714285716, standard_deviation: 4.933475325294178 } } }
22
24
  end
23
25
  end
24
26
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'engtagger'
2
4
 
3
5
  module KnownItemSearchClassifier
@@ -25,7 +27,7 @@ module KnownItemSearchClassifier
25
27
  private
26
28
 
27
29
  def is_mixed_case?
28
- return 1.0 if @string =~ /[A-Z]/ and @string =~ /[a-z]/
30
+ return 1.0 if @string =~ (/[A-Z]/) && @string =~ (/[a-z]/)
29
31
 
30
32
  0.0
31
33
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'known_item_search_classifier/default_training_set'
2
4
  require 'known_item_search_classifier/feature_extractor'
3
5
  require 'known_item_search_classifier/classifier'
@@ -1,46 +1,148 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'minitest/autorun'
2
4
  require './lib/known_item_search_classifier'
3
5
 
4
6
  class KnownItemSearchClassifierTest < Minitest::Test
5
7
  classifier = KnownItemSearchClassifier::Classifier.new
6
8
 
7
- known_item_training_set = [
8
- # 'hobbit first edition', -- classifier incorrectly classifies this as unknown
9
- # 'my soul is rested', -- classifier incorrectly classifies this as unknown
10
- # 'new yorker', -- classifier incorrectly classifies this as unknown
11
- # 'when harry met sally', -- classifier incorrectly classifies this as unknown
12
- # '"neo tekunoroji"', -- classifier incorrectly classifies this as unknown
13
- '99131236427206421',
14
- 'A decision making model for selecting start-up businesses in a government venture capital scheme',
15
- # 'Dostoevsky Brothers Karamazov', -- classifier incorrectly classifies this as unknown
16
- # 'Lawrence Classic American Literature', -- classifier incorrectly classifies this as unknown
17
- # 'salt sugar fat', -- classifier incorrectly classifies this as unknown
18
- 'Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction. Viking. 2015. Print',
19
- 'the inconvenient truth',
20
- 'Polarization: What Everyone Needs to Know',
21
- 'little house on the'
9
+ known_items = [
10
+ '0036-8075',
11
+ '9781324033356',
12
+ 'British Documents on the End of Empire',
13
+ '9780393979503',
14
+ '9781319339425',
15
+ 'D810.W7 A5313 2017b',
16
+ '9783031512117',
17
+ '9780470044377',
18
+ '9780393609615',
19
+ 'AC8 .B4353 1979',
20
+ # 'Kafka metamorphosis',
21
+ # 'how children develop',
22
+ # 'milton\'s paradise lost',
23
+ '9780471140260',
24
+ # 'Fortaleciendo la participación política de las mujeres',
25
+ # 'historia augusta',
26
+ # 'karl marx letters',
27
+ # 'stuck in place',
28
+ 'Court and Country: Studies in Tudor Social History',
29
+ 'E184.6 .L48 2009',
30
+ 'JK274 .K62 1995',
31
+ 'JZ1480 .H68 2013',
32
+ 'TD186.5.B35 A4413 2016',
33
+ 'Women and gender issues in Bolivia, II, 1991-2003.',
34
+ # 'arrow of god',
35
+ # 'going stealth',
36
+ # 'greek dictionary brill',
37
+ # 'historia de los indios motolinia',
38
+ '9780143115267',
39
+ '9780321573513',
40
+ '9788498956160',
41
+ 'AC25 .B3132 1972b',
42
+ 'Agrarian issues in Bolivia, I, 1989-2004.',
43
+ 'Indigenous peoples, peasants, and ethnic minorities in Bolivia, I, 1970-2005.',
44
+ 'MT6 .C57 2016',
45
+ # 'Plato\'s symposium',
46
+ 'le droit a la ville',
47
+ # 'providence journal',
48
+ 'the atlantic',
49
+ '9780137605521',
50
+ '9788876768170',
51
+ # 'Book of Mormon',
52
+ # 'Euclid\'s elements'
22
53
  ]
23
- known_item_training_set.each do |query|
54
+ known_items.each do |query|
24
55
  cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
25
56
  define_method("test_#{cleaned_up_query}_is_classified_as_known_item") do
26
57
  assert_equal(:known, classifier.is_known_item_search?(query).to_sym)
27
58
  end
28
59
  end
29
60
 
30
- unknown_item_training_set = [
31
- 'colonial mexico textiles',
32
- 'history of horses',
33
- 'medical expertise COVID',
34
- 'music and sexuality',
35
- 'paper industry',
36
- 'sun ra',
37
- # 'concussions after the nfl', -- classifier incorrectly classifies this as known
38
- 'Professional baking ',
39
- 'Manos chatzidakis',
40
- 'whey protein',
41
- 'benefits of eating healthyhy'
61
+ unknown_items = [
62
+ '"paul parin"',
63
+ 'architecture new orleans',
64
+ 'Al-alusi',
65
+ 'animals NOT coin / ',
66
+ 'hanne darboven',
67
+ 'William Burroughs',
68
+ 'Platonic eros',
69
+ 'cavafy',
70
+ 'Marcel proust',
71
+ 'Zisterzienser',
72
+ 'costume',
73
+ 'cultural delegation',
74
+ 'Akhir al-Zaman',
75
+ 'Heidegger',
76
+ 'Nietzsche',
77
+ 'Wind in poetry',
78
+ 'argentina',
79
+ 'facades architecture',
80
+ 'medieval women',
81
+ 'monrovia urbanization',
82
+ 'philadelphia',
83
+ 'roman theater',
84
+ 'Holderline',
85
+ 'Wittgenstein',
86
+ 'hindi history politics',
87
+ 'post-industrial society',
88
+ 'Luxemburg',
89
+ 'Tasso',
90
+ 'Women in Scandinavia',
91
+ 'corporate governance',
92
+ 'dance technique',
93
+ 'water atlas',
94
+ 'Diskin Clay',
95
+ 'charlotte posenenske',
96
+ 'dinosaur',
97
+ 'paul rudolph',
98
+ 'urdu hindi history',
99
+ 'Ashrat al',
100
+ 'Ephemera',
101
+ 'John Hopfield',
102
+ 'Russian formalism',
103
+ 'artificial intelligence',
104
+ 'bats',
105
+ # 'coffee and the american revolution',
106
+ 'cold war middle east',
107
+ 'environmental history ethiopia',
108
+ 'indigenous',
109
+ 'politbiuro',
110
+ 'stoicism',
111
+ 'suetonius',
112
+ 'tokyo edo',
113
+ 'washington dc',
114
+ 'Achaean war',
115
+ 'Isabella piccini',
116
+ 'Renato Sollima',
117
+ 'animals',
118
+ 'auguste leroux',
119
+ 'blockchain',
120
+ 'britain navy diary',
121
+ 'fairy',
122
+ 'jose guadalupe posada',
123
+ 'lesbian',
124
+ 'physics textbook',
125
+ 'walter benjamin',
126
+ '"Middlebury College Museum of Art"',
127
+ '"daniel catan"',
128
+ 'Hellenistic kingdoms',
129
+ 'Albrecht Classen',
130
+ 'Athir al-din al-abhari',
131
+ 'Berlin',
132
+ 'David konstan',
133
+ 'Elizabeth Cary Mariam',
134
+ 'Euclid',
135
+ 'George sand on literature',
136
+ 'Kyrgyz art',
137
+ 'Slavic folklore',
138
+ 'mass incarceration california',
139
+ 'military history korean war',
140
+ 'snakes',
141
+ 'sondheim',
142
+ '"bronx"',
143
+ 'Between Hermon and Sinai'
42
144
  ]
43
- unknown_item_training_set.each do |query|
145
+ unknown_items.each do |query|
44
146
  cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
45
147
  define_method("test_#{cleaned_up_query}_is_not_false_positive") do
46
148
  assert_equal(:unknown, classifier.is_known_item_search?(query).to_sym)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: known_item_search_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jane Sandberg