known_item_search_classifier 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/known_item_search_classifier/classifier.rb +5 -3
- data/lib/known_item_search_classifier/default_training_set.rb +20 -18
- data/lib/known_item_search_classifier/feature_extractor.rb +3 -1
- data/lib/known_item_search_classifier.rb +2 -0
- data/test/known_item_search_classifier_test.rb +131 -29
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 026e2cc9387f7e1bbfd52bcf37d24a32e5f44665f4217e8e87276fff08cd9951
|
4
|
+
data.tar.gz: 2c4d42876c29118a017397d1cb9a55a6e79323cbca71929b6f1ff72b414fd353
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '0295cfeddf0ed53d31a39053b5cbded6ef311eaf319a056564889953a592aa89e379a10274b7972f9109b8066fd594932b4511d2cbf4da72b06df65643cfba2b'
|
7
|
+
data.tar.gz: ec9c1cfb9ad88193d7c8b0d52d31b0b9744c4d2862f4fc868240678219632cb6a364c4ff14180de66d5aa95e5a4f293de85bea45fc7d132069b41d9ec9c1e818
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Classifies search strings as either known-item searches or unknown-item searches
|
2
4
|
require 'csv'
|
3
5
|
require 'gaussian_naive_bayes'
|
@@ -29,9 +31,9 @@ module KnownItemSearchClassifier
|
|
29
31
|
end
|
30
32
|
end
|
31
33
|
|
32
|
-
|
34
|
+
attr_reader :custom_training_set
|
33
35
|
|
34
|
-
|
36
|
+
private
|
35
37
|
|
36
38
|
def classify(string)
|
37
39
|
f = FeatureExtractor.new string
|
@@ -43,7 +45,7 @@ module KnownItemSearchClassifier
|
|
43
45
|
query_class = @default_training_set.classify(feature_array)
|
44
46
|
end
|
45
47
|
return query_class
|
46
|
-
return true if
|
48
|
+
return true if query_class == :known
|
47
49
|
|
48
50
|
false
|
49
51
|
end
|
@@ -1,24 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module KnownItemSearchClassifier
|
2
4
|
class DefaultTrainingSet
|
3
|
-
|
5
|
+
def categories_probabilities
|
6
|
+
{ 'unknown' => 0.552, 'known' => 0.448 }
|
7
|
+
end
|
4
8
|
|
5
|
-
def
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
4 => { mean: 2.44, standard_deviation: 1.0720950308167836 },
|
21
|
-
5 => { mean: 0.14, standard_deviation: 0.7001457574195914 } } }
|
9
|
+
def categories_summaries
|
10
|
+
{ 'unknown' =>
|
11
|
+
{ 0 => { mean: 0.34782608695652173, standard_deviation: 0.4774351058385226 },
|
12
|
+
1 => { mean: 0.024483651893859832, standard_deviation: 0.09877772267472562 },
|
13
|
+
2 => { mean: 0.004589371980676328, standard_deviation: 0.032912070012630565 },
|
14
|
+
3 => { mean: 0.23732759224252611, standard_deviation: 0.3775861432826544 },
|
15
|
+
4 => { mean: 2.5072463768115942, standard_deviation: 1.7511981064219535 },
|
16
|
+
5 => { mean: 0.05314009661835749, standard_deviation: 0.44293515127930905 } },
|
17
|
+
'known' =>
|
18
|
+
{ 0 => { mean: 0.6011904761904762, standard_deviation: 0.49111727628992086 },
|
19
|
+
1 => { mean: 0.05071208898451003, standard_deviation: 0.10551578513388955 },
|
20
|
+
2 => { mean: 0.043913293286663324, standard_deviation: 0.089967379556707 },
|
21
|
+
3 => { mean: 0.24845155590636964, standard_deviation: 0.3038306400149365 },
|
22
|
+
4 => { mean: 5.071428571428571, standard_deviation: 4.292552286529769 },
|
23
|
+
5 => { mean: 2.8214285714285716, standard_deviation: 4.933475325294178 } } }
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'engtagger'
|
2
4
|
|
3
5
|
module KnownItemSearchClassifier
|
@@ -25,7 +27,7 @@ module KnownItemSearchClassifier
|
|
25
27
|
private
|
26
28
|
|
27
29
|
def is_mixed_case?
|
28
|
-
return 1.0 if @string =~ /[A-Z]/
|
30
|
+
return 1.0 if @string =~ (/[A-Z]/) && @string =~ (/[a-z]/)
|
29
31
|
|
30
32
|
0.0
|
31
33
|
end
|
@@ -1,46 +1,148 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'minitest/autorun'
|
2
4
|
require './lib/known_item_search_classifier'
|
3
5
|
|
4
6
|
class KnownItemSearchClassifierTest < Minitest::Test
|
5
7
|
classifier = KnownItemSearchClassifier::Classifier.new
|
6
8
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
'
|
14
|
-
'
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
'
|
19
|
-
'
|
20
|
-
'
|
21
|
-
'
|
9
|
+
known_items = [
|
10
|
+
'0036-8075',
|
11
|
+
'9781324033356',
|
12
|
+
'British Documents on the End of Empire',
|
13
|
+
'9780393979503',
|
14
|
+
'9781319339425',
|
15
|
+
'D810.W7 A5313 2017b',
|
16
|
+
'9783031512117',
|
17
|
+
'9780470044377',
|
18
|
+
'9780393609615',
|
19
|
+
'AC8 .B4353 1979',
|
20
|
+
# 'Kafka metamorphosis',
|
21
|
+
# 'how children develop',
|
22
|
+
# 'milton\'s paradise lost',
|
23
|
+
'9780471140260',
|
24
|
+
# 'Fortaleciendo la participación política de las mujeres',
|
25
|
+
# 'historia augusta',
|
26
|
+
# 'karl marx letters',
|
27
|
+
# 'stuck in place',
|
28
|
+
'Court and Country: Studies in Tudor Social History',
|
29
|
+
'E184.6 .L48 2009',
|
30
|
+
'JK274 .K62 1995',
|
31
|
+
'JZ1480 .H68 2013',
|
32
|
+
'TD186.5.B35 A4413 2016',
|
33
|
+
'Women and gender issues in Bolivia, II, 1991-2003.',
|
34
|
+
# 'arrow of god',
|
35
|
+
# 'going stealth',
|
36
|
+
# 'greek dictionary brill',
|
37
|
+
# 'historia de los indios motolinia',
|
38
|
+
'9780143115267',
|
39
|
+
'9780321573513',
|
40
|
+
'9788498956160',
|
41
|
+
'AC25 .B3132 1972b',
|
42
|
+
'Agrarian issues in Bolivia, I, 1989-2004.',
|
43
|
+
'Indigenous peoples, peasants, and ethnic minorities in Bolivia, I, 1970-2005.',
|
44
|
+
'MT6 .C57 2016',
|
45
|
+
# 'Plato\'s symposium',
|
46
|
+
'le droit a la ville',
|
47
|
+
# 'providence journal',
|
48
|
+
'the atlantic',
|
49
|
+
'9780137605521',
|
50
|
+
'9788876768170',
|
51
|
+
# 'Book of Mormon',
|
52
|
+
# 'Euclid\'s elements'
|
22
53
|
]
|
23
|
-
|
54
|
+
known_items.each do |query|
|
24
55
|
cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
|
25
56
|
define_method("test_#{cleaned_up_query}_is_classified_as_known_item") do
|
26
57
|
assert_equal(:known, classifier.is_known_item_search?(query).to_sym)
|
27
58
|
end
|
28
59
|
end
|
29
60
|
|
30
|
-
|
31
|
-
'
|
32
|
-
'
|
33
|
-
'
|
34
|
-
'
|
35
|
-
'
|
36
|
-
'
|
37
|
-
|
38
|
-
'
|
39
|
-
'
|
40
|
-
'
|
41
|
-
'
|
61
|
+
unknown_items = [
|
62
|
+
'"paul parin"',
|
63
|
+
'architecture new orleans',
|
64
|
+
'Al-alusi',
|
65
|
+
'animals NOT coin / ',
|
66
|
+
'hanne darboven',
|
67
|
+
'William Burroughs',
|
68
|
+
'Platonic eros',
|
69
|
+
'cavafy',
|
70
|
+
'Marcel proust',
|
71
|
+
'Zisterzienser',
|
72
|
+
'costume',
|
73
|
+
'cultural delegation',
|
74
|
+
'Akhir al-Zaman',
|
75
|
+
'Heidegger',
|
76
|
+
'Nietzsche',
|
77
|
+
'Wind in poetry',
|
78
|
+
'argentina',
|
79
|
+
'facades architecture',
|
80
|
+
'medieval women',
|
81
|
+
'monrovia urbanization',
|
82
|
+
'philadelphia',
|
83
|
+
'roman theater',
|
84
|
+
'Holderline',
|
85
|
+
'Wittgenstein',
|
86
|
+
'hindi history politics',
|
87
|
+
'post-industrial society',
|
88
|
+
'Luxemburg',
|
89
|
+
'Tasso',
|
90
|
+
'Women in Scandinavia',
|
91
|
+
'corporate governance',
|
92
|
+
'dance technique',
|
93
|
+
'water atlas',
|
94
|
+
'Diskin Clay',
|
95
|
+
'charlotte posenenske',
|
96
|
+
'dinosaur',
|
97
|
+
'paul rudolph',
|
98
|
+
'urdu hindi history',
|
99
|
+
'Ashrat al',
|
100
|
+
'Ephemera',
|
101
|
+
'John Hopfield',
|
102
|
+
'Russian formalism',
|
103
|
+
'artificial intelligence',
|
104
|
+
'bats',
|
105
|
+
# 'coffee and the american revolution',
|
106
|
+
'cold war middle east',
|
107
|
+
'environmental history ethiopia',
|
108
|
+
'indigenous',
|
109
|
+
'politbiuro',
|
110
|
+
'stoicism',
|
111
|
+
'suetonius',
|
112
|
+
'tokyo edo',
|
113
|
+
'washington dc',
|
114
|
+
'Achaean war',
|
115
|
+
'Isabella piccini',
|
116
|
+
'Renato Sollima',
|
117
|
+
'animals',
|
118
|
+
'auguste leroux',
|
119
|
+
'blockchain',
|
120
|
+
'britain navy diary',
|
121
|
+
'fairy',
|
122
|
+
'jose guadalupe posada',
|
123
|
+
'lesbian',
|
124
|
+
'physics textbook',
|
125
|
+
'walter benjamin',
|
126
|
+
'"Middlebury College Museum of Art"',
|
127
|
+
'"daniel catan"',
|
128
|
+
'Hellenistic kingdoms',
|
129
|
+
'Albrecht Classen',
|
130
|
+
'Athir al-din al-abhari',
|
131
|
+
'Berlin',
|
132
|
+
'David konstan',
|
133
|
+
'Elizabeth Cary Mariam',
|
134
|
+
'Euclid',
|
135
|
+
'George sand on literature',
|
136
|
+
'Kyrgyz art',
|
137
|
+
'Slavic folklore',
|
138
|
+
'mass incarceration california',
|
139
|
+
'military history korean war',
|
140
|
+
'snakes',
|
141
|
+
'sondheim',
|
142
|
+
'"bronx"',
|
143
|
+
'Between Hermon and Sinai'
|
42
144
|
]
|
43
|
-
|
145
|
+
unknown_items.each do |query|
|
44
146
|
cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
|
45
147
|
define_method("test_#{cleaned_up_query}_is_not_false_positive") do
|
46
148
|
assert_equal(:unknown, classifier.is_known_item_search?(query).to_sym)
|