warclight 0.6.3 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +1 -1
- data/app/assets/images/blacklight/logo.png +0 -0
- data/app/assets/javascripts/warclight/warclight.js +1 -0
- data/app/assets/stylesheets/warclight/application.scss +2 -1
- data/lib/generators/warclight/install_generator.rb +5 -0
- data/lib/generators/warclight/templates/catalog_controller.rb +8 -9
- data/lib/warclight/version.rb +1 -1
- data/solr/conf/elevate.xml +10 -14
- data/solr/conf/lang/contractions_fr.txt +0 -6
- data/solr/conf/lang/stopwords_da.txt +0 -2
- data/solr/conf/lang/stopwords_de.txt +0 -2
- data/solr/conf/lang/stopwords_es.txt +0 -2
- data/solr/conf/lang/stopwords_fi.txt +0 -2
- data/solr/conf/lang/stopwords_fr.txt +1 -4
- data/solr/conf/lang/stopwords_hu.txt +0 -2
- data/solr/conf/lang/stopwords_it.txt +0 -2
- data/solr/conf/lang/stopwords_nl.txt +0 -2
- data/solr/conf/lang/stopwords_no.txt +0 -2
- data/solr/conf/lang/stopwords_pt.txt +0 -2
- data/solr/conf/lang/stopwords_ru.txt +0 -2
- data/solr/conf/lang/stopwords_sv.txt +0 -2
- data/solr/conf/schema.xml +543 -304
- data/solr/conf/solrconfig.xml +1933 -140
- data/solr/conf/solrcore.properties +1 -0
- data/solr/conf/stopwords_path.txt +7 -0
- data/solr/conf/synonyms.txt +13 -13
- data/template.rb +1 -4
- data/warclight.gemspec +2 -0
- metadata +33 -3
- data/app/assets/stylesheets/warclight/warclight.scss +0 -1
data/solr/conf/schema.xml
CHANGED
@@ -16,349 +16,588 @@
|
|
16
16
|
limitations under the License.
|
17
17
|
-->
|
18
18
|
|
19
|
+
<!--
|
20
|
+
This schema is for Solr 7+ and will not work under Solr 6.
|
21
|
+
-->
|
22
|
+
|
19
23
|
<schema name="ukwa" version="1.6">
|
20
|
-
|
21
|
-
|
22
|
-
<field name="
|
23
|
-
<field name="
|
24
|
-
<field name="
|
24
|
+
<fields>
|
25
|
+
<!-- Solr special purpose meta-fields. Explicit attributes to be sure they are set correctly -->
|
26
|
+
<field name="id" type="string" indexed="true" stored="true" docValues="true" required="true" />
|
27
|
+
<field name="_version_" type="long" indexed="true" stored="true" docValues="true" />
|
28
|
+
<field name="_root_" type="string" indexed="true" stored="true" docValues="true"/>
|
29
|
+
<field name="_text_" type="text_general" multiValued="true" /> <!-- Isn't this only used for schema-less? -->
|
25
30
|
|
26
|
-
<!--
|
27
|
-
|
31
|
+
<!-- The time of document indexing. Set automatically by Solr.
|
32
|
+
Sample use: Freezing a query result even when new documents are added to the index:
|
33
|
+
q=foo&fq=index_time:[* TO 2018-05-16T10:33:00Z]
|
34
|
+
Sample use: Discover new documents added since last check for new documents:
|
35
|
+
q=*:*&fq=index_time:[2018-05-16T10:33:00Z TO *] -->
|
36
|
+
<field name="index_time" type="date" default="NOW" />
|
28
37
|
|
29
|
-
<!-- BL UKWA:
|
30
|
-
<field name="access_terms"
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
<field name="
|
36
|
-
|
37
|
-
|
38
|
-
<field name="
|
39
|
-
|
40
|
-
|
41
|
-
<field name="
|
42
|
-
<field name="
|
43
|
-
|
44
|
-
|
45
|
-
<field name="
|
46
|
-
|
47
|
-
|
48
|
-
<field name="
|
49
|
-
|
50
|
-
|
51
|
-
<field name="
|
52
|
-
|
53
|
-
|
54
|
-
<field name="
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
<field name="
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
<field name="
|
64
|
-
|
65
|
-
|
66
|
-
<field name="
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
<field name="
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
<field name="
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
<field name="
|
81
|
-
<field name="
|
82
|
-
|
83
|
-
|
84
|
-
<field name="
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
<field name="
|
90
|
-
|
91
|
-
|
92
|
-
<field name="
|
93
|
-
|
94
|
-
<field name="
|
95
|
-
|
96
|
-
|
97
|
-
<field name="
|
98
|
-
|
99
|
-
<field name="
|
100
|
-
|
101
|
-
<field name="
|
102
|
-
|
103
|
-
<field name="
|
104
|
-
|
105
|
-
<field name="
|
106
|
-
|
107
|
-
<field name="
|
108
|
-
|
109
|
-
|
110
|
-
<field name="
|
111
|
-
|
112
|
-
<field name="
|
113
|
-
|
114
|
-
|
115
|
-
<field name="
|
116
|
-
|
117
|
-
<field name="
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
<field name="
|
122
|
-
|
123
|
-
|
124
|
-
<field name="
|
125
|
-
|
126
|
-
|
38
|
+
<!-- BL UKWA: Access flag (i.e. Open Access or not) -->
|
39
|
+
<field name="access_terms" type="string" multiValued="true" />
|
40
|
+
|
41
|
+
<!-- Author extracted from HTML meta-fields, Word documents meta data, image Exif etc.
|
42
|
+
Search directly in the author-field is verbatim and thus not very usable for user-defined queries.
|
43
|
+
Sample use: Faceting with facet=true&facet.field=author -->
|
44
|
+
<field name="author" type="string" />
|
45
|
+
|
46
|
+
<!-- Does not seem to be used as of 20180516 -->
|
47
|
+
<field name="category" type="text_general" />
|
48
|
+
|
49
|
+
<!-- Institution-specific collection names. Can be specified when calling the indexer -->
|
50
|
+
<field name="collection" type="string" multiValued="true" /> <!-- Why is this multi-valued? -->
|
51
|
+
<field name="collections" type="string" multiValued="true" />
|
52
|
+
|
53
|
+
<!-- Does not seem to be used as of 20180516 -->
|
54
|
+
<field name="comments" type="text_general" multiValued="true" />
|
55
|
+
|
56
|
+
<!-- Dublin Core description tag from HTML pages -->
|
57
|
+
<field name="description" type="text_general" />
|
58
|
+
|
59
|
+
<!-- hashtags and other keywords -->
|
60
|
+
<field name="keywords" type="text_general" multiValued="true" />
|
61
|
+
|
62
|
+
<!-- Licence URL as specified on HTML pages using links with rel=license -->
|
63
|
+
<field name="license_url" type="string" multiValued="true" />
|
64
|
+
|
65
|
+
<!-- The core content of the resource (all text with tags stripped from HTML pages, the text in a Word document)
|
66
|
+
Note: This field is not searchable. Use 'text' for search.
|
67
|
+
Sample use: Highlighting with q=floodgate&hl=true&hl.field=content -->
|
68
|
+
<field name="content" type="text_general" indexed="false" />
|
69
|
+
|
70
|
+
<!-- The original encoding of the content (UTF-8/ISO-8859-1/Windows-1250...)
|
71
|
+
Note: Regardless of the original encoding, content is always converted to UTF-8 in the Solr document -->
|
72
|
+
<field name="content_encoding" type="string" />
|
73
|
+
|
74
|
+
<!-- The first 4 bytes of the content, represented as lower-case hex with no space -->
|
75
|
+
<field name="content_ffb" type="string" />
|
76
|
+
|
77
|
+
<!-- The first 32 bytes of the content, represented as shingled space-separated lower-case hex.
|
78
|
+
Sample use: Locate sub-sequences of bytes within the first 32 bytes (signature search):
|
79
|
+
content_first_bytes:"89 50 4e 47" locates content which is probably PNG -->
|
80
|
+
<field name="content_first_bytes" type="hex_text_shingle" />
|
81
|
+
|
82
|
+
<!-- Language as detected by Tika.
|
83
|
+
Sample use: Faceting on language with facet=true&facet.field=content_language -->
|
84
|
+
<field name="content_language" type="string" />
|
85
|
+
|
86
|
+
<!-- The content length measured in bytes.
|
87
|
+
Sample use: Sort by content size with sort=content_length desc
|
88
|
+
Sample use: Size statistics for the full result set: stats=true&stats.field=content_length -->
|
89
|
+
<field name="content_length" type="int" />
|
90
|
+
<!-- <field name="content_metadata_ss" type="string" multiValued="true" />--> <!-- Not used for anything -->
|
91
|
+
|
92
|
+
<!-- If warc.index.tika.extract_all_metadata was enabled during indexing, Tika metadata is added here. -->
|
93
|
+
<field name="content_metadata" type="text_general" />
|
94
|
+
|
95
|
+
<!-- The content length measured in characters. Mostly relevant for text-based formats (html, doc, pdf...).
|
96
|
+
Sample use: Sort by text length with sort=content_text_length desc
|
97
|
+
Sample use: Size statistics for the full result set: stats=true&stats.field=content_text_length -->
|
98
|
+
<field name="content_text_length" type="int" />
|
99
|
+
|
100
|
+
<!-- The MIME content type as determined by DROID -->
|
101
|
+
<field name="content_type_droid" type="string" />
|
102
|
+
<!-- The file extension: my.sample.png will yield 'png' -->
|
103
|
+
<field name="content_type_ext" type="string" />
|
104
|
+
<!-- Best-guess MIME-type for the content, based on droid, Tika, WARC-header, HTTP-header and
|
105
|
+
webarchive-discovery processing -->
|
106
|
+
<field name="content_type_full" type="string" />
|
107
|
+
<!-- Content type represented as low-cardinality human-readable text: image, video, text etc. -->
|
108
|
+
<field name="content_type_norm" type="string" default="other" />
|
109
|
+
<!-- The MIME content type as specified by the web server the resource was harvested from -->
|
110
|
+
<field name="content_type_served" type="string" />
|
111
|
+
<!-- The MIME content type as determined by Tika -->
|
112
|
+
<field name="content_type_tika" type="string" />
|
113
|
+
<!-- Not clear what this is. TODO: Determine what it is -->
|
114
|
+
<field name="content_type" type="string" /> <!-- Used to be multi-valued -->
|
115
|
+
<!-- The version for the MIME type, if available -->
|
116
|
+
<field name="content_type_version" type="string" />
|
117
|
+
|
118
|
+
<!-- The HTML elements used if the resource is a HTML page -->
|
119
|
+
<field name="elements_used" type="string" multiValued="true" />
|
120
|
+
<!-- Hash of the content (SHA1) -->
|
121
|
+
<field name="hash" type="string" />
|
122
|
+
|
123
|
+
<!-- Does not seem to be used as of 20180516 -->
|
124
|
+
<field name="hashes" type="string" multiValued="true" />
|
125
|
+
<!-- Does not seem to be used as of 20180516 -->
|
126
|
+
<field name="id_long" type="long" />
|
127
|
+
|
128
|
+
<!-- The date represented as a long in the form of YYYYmmddHHMMSS, which is compatible with Wayback.
|
129
|
+
The field is not searchable. Use crawl_date for search and general processing -->
|
130
|
+
<field name="wayback_date" type="long" indexed="false" stored="true" docValues="false" />
|
131
|
+
<!-- If webarchive-discovery runs in update-mode, multiple harvests of the same URL will be collapsed to
|
132
|
+
a single document and the dates from the different harvests will be added to this field -->
|
133
|
+
<field name="crawl_dates" type="date" stored="true" docValues="false" multiValued="true" />
|
134
|
+
<!-- The crawl-date as specified in the WARC.
|
135
|
+
Sample use: Faceting by date with
|
136
|
+
facet=true&facet.range=crawl_date&facet.range.start=2010-01-01T00:00:00Z&facet.range.end=2019-01-01T00:00:00Z&facet.range.gap=+1MONTH&facet.range.method=dv
|
137
|
+
Sample use: Sorting newest material first: sort=crawl_date desc
|
138
|
+
-->
|
139
|
+
<field name="crawl_date" type="date" />
|
140
|
+
<!-- month_day & day not used for anything -->
|
141
|
+
<!-- <field name="crawl_year_month_day" type="int" />
|
142
|
+
<field name="crawl_year_month" type="int" />-->
|
143
|
+
<!-- If webarchive-discovery runs in update-mode, multiple harvests of the same URL will be collapsed to
|
144
|
+
a single document and the years from the dates from the different harvests will be added to this field -->
|
145
|
+
<field name="crawl_years" type="int" multiValued="true" />
|
146
|
+
<!-- The year extracted from crawl_date. Faster than crawl_date if used for faceting.
|
147
|
+
Sample use: Faceting by year with facet.field=crawl_year&facet.sort=index&facet=true -->
|
148
|
+
<field name="crawl_year" type="int" />
|
149
|
+
<!-- Last modified timestamp extracted from the resource. Sources such as JPEG images, PDF files and Word
|
150
|
+
documents often have this.
|
151
|
+
Note: This is not a very reliable timestamp for most formats. JPEGs tend to work quite well.
|
152
|
+
Sample use: Sorting by age as stated in the format sort=last_modified asc -->
|
153
|
+
<field name="last_modified" type="date" />
|
154
|
+
<!-- The year from last_modified -->
|
155
|
+
<field name="last_modified_year" type="string" /> <!-- Why is this a string? -->
|
156
|
+
|
157
|
+
<!-- Heavily normalised URL: http/https is collapsed to http, everything is lowercased, trailing / are removed
|
158
|
+
for all URLs, except those pointing to root, e.g. "http://example.com/". There is more processing than
|
159
|
+
that. If the field is to be queried with a user-provided URL, it is highly recommended to use the method
|
160
|
+
Normalisation.canonicaliseURL() from webarchive-discovery to ensure match.
|
161
|
+
This field matches normalisation with the links-field, making it possible to perform graph traversals.
|
162
|
+
Note: This field has very high cardinality (a little less than the number of documents in the index).
|
163
|
+
Faceting should be done with care and is likely to fail with an OutOfMemoryException on a large index
|
164
|
+
using stock Solr. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
|
165
|
+
<field name="url_norm" type="string" />
|
166
|
+
<!-- Variation of url_norm intended for search for partial URLs.
|
167
|
+
Sample use: Search for large images with q=url_search:"images/large" -->
|
168
|
+
<field name="url_search" type="path" stored="false" /> <!-- search only to save space-->
|
169
|
+
<!-- Path-only for the URL: http://example.com/foo/bar.png becomes /foo/bar.png -->
|
170
|
+
<field name="url_path" type="string" />
|
171
|
+
<!-- Original URL, as specified in the WARC header. Not analysed and thus likely to give false negatives
|
172
|
+
if searched directly with user-input. Consider using url_norm for searching.
|
173
|
+
Note: This field has very high cardinality (a little less than the number of documents in the index).
|
174
|
+
Faceting should be done with care and is likely to fail with an OutOfMemoryException on a large index
|
175
|
+
using stock Solr. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
|
176
|
+
<field name="url" type="string" />
|
177
|
+
<!-- Possible values: normal, robots.txt and slashpage (root page for the domain).
|
178
|
+
Sample use: Search only for root pages with q=url_type:slashpage -->
|
179
|
+
<field name="url_type" type="string" />
|
180
|
+
<!-- The domain from the URL. The domain is the short name, registered by the domain owner.
|
181
|
+
This can be coupled with the links_domains field for building graphs.
|
182
|
+
Sample use: Faceting to show most popular domains with facet=true&facet.field=domain -->
|
183
|
+
<field name="domain" type="string" />
|
184
|
+
<!-- The host from the URL. The ending of the host is always the same as the domain, with optional prefix,
|
185
|
+
e.g. a host can be foo.bar.zoo.example.com or just example.com for the domain example.com.
|
186
|
+
Sample use: Faceting to show most popular hosts with facet=true&facet.field=host -->
|
187
|
+
<field name="host" type="string" />
|
188
|
+
<!-- The host from the URL in SURT'ed form: http://webarchivingbucket.com/techblog/?p=48
|
189
|
+
e.g. foo.bar.dk becomes the three values ["(dk,", "(dk,bar,", "(dk,bar,foo"]. -->
|
190
|
+
<field name="host_surt" type="string" multiValued="true" />
|
191
|
+
<!-- The part below the domain in the URL. For all dk-domains it will be dk. For domains such
|
192
|
+
as myname.blogspot.com and mycompany.co.uk it will be blogspot.com and co.uk. -->
|
193
|
+
<field name="public_suffix" type="string" />
|
194
|
+
<!-- The last part of the URL, typically a filename, e.g. giant_rabbitFoot.png. The field is analysed with an
|
195
|
+
aggressive tokenizer, so that giant_rabbitFoot.png is split into [giant, rabbit, foot, png] and searches
|
196
|
+
are not dependent on knowing file extensions etc.
|
197
|
+
See resourcename_facet for sorting, grouping and faceting.
|
198
|
+
Sample use: Search for images of kittens: q=resourcename:kittens&fq=content_type_norm:image -->
|
199
|
+
<field name="resourcename" type="path" />
|
200
|
+
<!-- Mirror of resourcename intended for sorting, grouping and faceting.
|
201
|
+
Important note: This is a high-cardinality field. Faceting on a web archive with billions of records
|
202
|
+
will likely lead to memory problems. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
|
203
|
+
<field name="resourcename_facet" type="string" />
|
204
|
+
|
205
|
+
<!-- Does not seem to be used as of 20180516 -->
|
206
|
+
<field name="image_colours" type="string" multiValued="true" />
|
207
|
+
<!-- If warc.index.extract.content.images.dominantColours was true during indexing, this field will contain
|
208
|
+
the dominant colour if the resource is an image. The colour is a human readable name, e.g. crimson,
|
209
|
+
ivory or goldenrod, as defined by https://www.w3.org/TR/SVG/types.html#ColorKeywords -->
|
210
|
+
<field name="image_dominant_colour" type="string" />
|
211
|
+
<!-- If warc.index.extract.content.images.detectFaces was true during indexing, this will contain the number
|
212
|
+
of faces detected if the resource is an image.
|
213
|
+
Note: Face recognition is heavy and it is recommended not to enable it unless the need is high -->
|
214
|
+
<field name="image_faces_count" type="int" />
|
215
|
+
<!-- If warc.index.extract.content.images.detectFaces was true during indexing, this will contain the faces
|
216
|
+
detected if the resource is an image. A face is represented by a bounding box relative to the original
|
217
|
+
image.
|
218
|
+
Note: Face recognition is heavy and it is recommended not to enable it unless the need is high -->
|
219
|
+
<field name="image_faces" type="string" indexed="false" stored="true" docValues="false" multiValued="true" />
|
220
|
+
<!-- Image height in pixels.
|
221
|
+
Sample use: Get statistics for image height with stats=true&stats.field=image_height -->
|
222
|
+
<field name="image_height" type="long" />
|
223
|
+
<!-- Image size in pixels (width*height).
|
224
|
+
Sample use: Get statistics for image size with stats=true&stats.field=image_size
|
225
|
+
Sample use: Locate largest images with sort=image_size desc -->
|
226
|
+
<field name="image_size" type="long" />
|
227
|
+
<!-- Image width in pixels.
|
228
|
+
Sample use: Get statistics for image width with stats=true&stats.field=image_width -->
|
229
|
+
<field name="image_width" type="long" />
|
230
|
+
|
231
|
+
<!-- Links to images shown on a given web page (aka embedded images).
|
232
|
+
Normalised the same way as url_norm -->
|
233
|
+
<field name="links_images" type="string" multiValued="true" />
|
234
|
+
<!-- domains from outgoing links for a HTML page -->
|
235
|
+
<field name="links_domains" type="string" multiValued="true" />
|
236
|
+
<!-- hosts from outgoing links for a HTML page -->
|
237
|
+
<field name="links_hosts" type="string" multiValued="true" />
|
238
|
+
<!-- SURT'ed form of hosts (see the host_surt field) from outgoing links for a HTML page -->
|
239
|
+
<field name="links_hosts_surts" type="string" multiValued="true" />
|
240
|
+
<!-- Might be used in the future but will take up a lot of space (same as 'links') -->
|
241
|
+
<!-- <field name="links_norm" type="string" multiValued="true" />-->
|
242
|
+
<!-- public suffixes (see public_suffix field) from outgoing links for a HTML page -->
|
243
|
+
<field name="links_public_suffixes" type="string" multiValued="true" />
|
244
|
+
<!-- Links to external resources (i.e. not images and other embedded content).
|
245
|
+
Normalised the same way as url_norm
|
246
|
+
Note: This field has extremely high cardinality (10 times the number of documents in the index).
|
247
|
+
Faceting should be done with care and is highly likely to fail with an OutOfMemoryException even on a
|
248
|
+
medium sized index using stock Solr. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
|
249
|
+
<field name="links" type="string" multiValued="true" />
|
250
|
+
|
251
|
+
<!-- Geographical coordinates, extracted from postcodes.
|
252
|
+
Sample use: Find images taken within a given radius from a given geo location with Solr geodist search
|
253
|
+
q=({!geofilt sfield=locations}) AND *:*&pt=56.17,10.20&d=0.8
|
254
|
+
where the coordinates can be retrieved from e.g. Google Maps, where they are shown at the bottom of
|
255
|
+
the map when a location is clicked. d is distance in kilometers.
|
256
|
+
See https://lucene.apache.org/solr/guide/7_3/spatial-search.html for details -->
|
257
|
+
<field name="locations" type="location" multiValued="true" />
|
258
|
+
|
259
|
+
<!-- Non-fatal errors during metadata extraction as part of indexing -->
|
260
|
+
<field name="parse_error" type="string" multiValued="true" />
|
261
|
+
<!-- If warc.index.extract.content.extractApachePreflightErrors is true during indexing, this field will
|
262
|
+
contain errors encountered during PDF/A-validation -->
|
263
|
+
<field name="pdf_pdfa_errors" type="string" multiValued="true" />
|
264
|
+
<!-- If warc.index.extract.content.extractApachePreflightErrors is true during indexing, this field will
|
265
|
+
be true if the resource was a PDF and a valid PDF/A.
|
266
|
+
Note: PDF validation is heavy and it is recommended not to enable it unless the need is high-->
|
267
|
+
<field name="pdf_pdfa_is_valid" type="string" />
|
268
|
+
<!-- UK postcodes only, as they are easily recognizable -->
|
269
|
+
<field name="postcode_district" type="string" multiValued="true" />
|
270
|
+
<!-- UK postcodes only, as they are easily recognizable -->
|
271
|
+
<field name="postcode" type="string" multiValued="true" />
|
272
|
+
|
273
|
+
<!-- Does not seem to be used as of 20180516 -->
|
274
|
+
<field name="publication_date" type="date" />
|
275
|
+
<!-- Does not seem to be used as of 20180516 -->
|
276
|
+
<field name="publication_year" type="string" />
|
277
|
+
<!-- The source format. Currently arc or warc. This might be extended in the future -->
|
278
|
+
<field name="record_type" type="string" />
|
279
|
+
<!-- If warc.index.extract.content.text_sentimentj was true during indexing, this field will contain a
|
280
|
+
numeric score for the sentiment, with 0.0 being "very negative" and high values being "very positive" -->
|
281
|
+
<field name="sentiment_score" type="float" />
|
282
|
+
<!-- If warc.index.extract.content.text_sentimentj was true during indexing, this field will contain a
|
283
|
+
human readable assessment of the sentiment, from "very negative" to "very positive" -->
|
284
|
+
<field name="sentiment" type="string" />
|
285
|
+
|
286
|
+
<!-- The HTTP server as stated in the HTTP-headers -->
|
287
|
+
<field name="server" type="string" multiValued="true" />
|
288
|
+
<!-- Status-code for the resource, as stated in the HTTP-headers from the originating web server -->
|
289
|
+
<field name="status_code" type="int" />
|
290
|
+
<!-- The generator for the resource, e.g. Wordpress or Photoshop -->
|
291
|
+
<field name="generator" type="string" multiValued="true" />
|
292
|
+
<!-- Does not seem to be used as of 20180516 -->
|
293
|
+
<field name="referrer_url" type="string" />
|
294
|
+
<!-- If the resource is returned with a 3xx HTTP response code, it is a redirection. This field contains
|
295
|
+
the URL that the resource redirects to, normalised like url_norm -->
|
296
|
+
<field name="redirect_to_norm" type="string" />
|
297
|
+
|
298
|
+
<!-- The full path of the origin container (typically WARC) for the harvested resource, e.g.
|
299
|
+
/harvests/full/2018-05/myharvest_20180516_1706.warc.gz
|
300
|
+
Sample use: Delivery of the raw resource by seeking to the source_file_offset in source_file and
|
301
|
+
reading the resource bytes (the size of the resource is stated in the headers in the source_file) -->
|
302
|
+
<field name="source_file_path" type="string" />
|
303
|
+
<!-- The offset for the resource within the source_file (aka WARC).
|
304
|
+
Sample use: Delivery of the raw resource by seeking to the source_file_offset in source_file and
|
305
|
+
reading the resource bytes (the size of the resource is stated in the headers in the source_file) -->
|
306
|
+
<field name="source_file_offset" type="long" /> <!-- docValues as it will probably be used for streaming export -->
|
307
|
+
<!-- The file name of the origin container (typically WARC) for the harvested resource, e.g.
|
308
|
+
myharvest_20180516_1706.warc.gz
|
309
|
+
Sample use: Delivery of the raw resource by seeking to the source_file_offset in source_file and
|
310
|
+
reading the resource bytes (the size of the resource is stated in the headers in the source_file) -->
|
311
|
+
<field name="source_file" type="string" />
|
312
|
+
|
313
|
+
<!-- Catch-all search field. All text content is copied here. -->
|
314
|
+
<field name="text" type="text_general" stored="false" multiValued="true" /> <!-- Catch-all -->
|
315
|
+
<!-- HTML page <title>, Word document title, Dublin Core title, etc -->
|
316
|
+
<field name="title" type="text_general" />
|
317
|
+
<!-- Variant of content_type_norm with human readable designations for the content type -->
|
318
|
+
<field name="type" type="string" />
|
319
|
+
|
320
|
+
<!-- Meta data from Web Curator Tool -->
|
321
|
+
<field name="wct_agency" type="string" />
|
322
|
+
<field name="wct_collections" type="string" multiValued="true" />
|
323
|
+
<field name="wct_description" type="text_general" />
|
324
|
+
<field name="wct_instance_id" type="int" indexed="true" stored="true" docValues="false" />
|
325
|
+
<field name="wct_subjects" type="string" multiValued="true" />
|
326
|
+
<field name="wct_target_id" type="string" />
|
327
|
+
<field name="wct_title" type="string" />
|
328
|
+
|
329
|
+
<!-- Root namespace for XML files.
|
330
|
+
Sample use: Facet to get most popular XML formats with facet=true&facet.field=xml_root_ns -->
|
331
|
+
<field name="xml_root_ns" type="string" />
|
332
|
+
<!-- WARC-Record-ID if available -->
|
333
|
+
<field name="warc_key_id" type="string" />
|
334
|
+
<!-- WARC-IP-Address if available -->
|
335
|
+
<field name="warc_ip" type="string" />
|
336
|
+
|
337
|
+
<!-- Geographical coordinates, extracted from image Exif data.
|
338
|
+
Sample use: Find images taken within a given radius from a given geo location with Solr geodist search
|
339
|
+
q=({!geofilt sfield=exif_location}) AND *:*&pt=56.17,10.20&d=0.8
|
340
|
+
where the coordinates can be retrieved from e.g. Google Maps, where they are shown at the bottom of
|
341
|
+
the map when a location is clicked. d is distance in kilometers.
|
342
|
+
See https://lucene.apache.org/solr/guide/7_3/spatial-search.html for details -->
|
343
|
+
<field name="exif_location" type="location" />
|
344
|
+
<!-- The Exif version (Exchangeable image file format) -->
|
345
|
+
<field name="exif_version" type="string" stored="true" docValues="false" />
|
346
|
+
|
347
|
+
<!-- Fuzzy matching on text for similarity search.
|
348
|
+
If warc.index.extract.content.text_fuzzy_hash is true during indexing, fields for SSDeep hashes will
|
349
|
+
be created. See https://ssdeep-project.github.io/ssdeep/ for details -->
|
350
|
+
<dynamicField name="ssdeep_hash_bs_*" type="string" stored="true" docValues="false" />
|
351
|
+
<!-- Does not seem to be used as of 20180517 -->
|
352
|
+
<dynamicField name="ssdeep_hash_ngram_bs_*" type="literal_ngram" stored="true" />
|
127
353
|
|
128
354
|
<!-- User supplied Archive-It fields: -->
|
129
|
-
<field name="institution"
|
130
|
-
<field name="collection_id"
|
355
|
+
<field name="institution" type="string" />
|
356
|
+
<field name="collection_id" type="string" />
|
131
357
|
<!--:User supplied Archive-It fields -->
|
132
358
|
|
133
|
-
|
134
|
-
|
135
|
-
<
|
136
|
-
<
|
137
|
-
<
|
138
|
-
<
|
139
|
-
<
|
140
|
-
<
|
141
|
-
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
|
142
|
-
<dynamicField name="*_bs" type="booleans" indexed="true" stored="true"/>
|
143
|
-
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
|
144
|
-
<dynamicField name="*_fs" type="floats" indexed="true" stored="true"/>
|
145
|
-
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>
|
146
|
-
<dynamicField name="*_ds" type="doubles" indexed="true" stored="true"/>
|
147
|
-
<dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false" />
|
148
|
-
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
|
149
|
-
<dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true"/>
|
150
|
-
<dynamicField name="*_p" type="location" indexed="true" stored="true"/>
|
151
|
-
<dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
|
152
|
-
<dynamicField name="*_tis" type="tints" indexed="true" stored="true"/>
|
153
|
-
<dynamicField name="*_tl" type="tlong" indexed="true" stored="true"/>
|
154
|
-
<dynamicField name="*_tls" type="tlongs" indexed="true" stored="true"/>
|
155
|
-
<dynamicField name="*_tf" type="tfloat" indexed="true" stored="true"/>
|
156
|
-
<dynamicField name="*_tfs" type="tfloats" indexed="true" stored="true"/>
|
157
|
-
<dynamicField name="*_td" type="tdouble" indexed="true" stored="true"/>
|
158
|
-
<dynamicField name="*_tds" type="tdoubles" indexed="true" stored="true"/>
|
159
|
-
<dynamicField name="*_tdt" type="tdate" indexed="true" stored="true"/>
|
160
|
-
<dynamicField name="*_tdts" type="tdates" indexed="true" stored="true"/>
|
161
|
-
<dynamicField name="ignored_*" type="ignored" multiValued="true"/>
|
162
|
-
<dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>
|
163
|
-
<dynamicField name="random_*" type="random" />
|
164
|
-
|
165
|
-
<!--:IMAGE EXIF-->
|
166
|
-
<field name="exif_location" type="location" indexed="true" stored="true" multiValued="false"/>
|
167
|
-
<field name="exif_version" type="string" indexed="true" stored="true" multiValued="false"/>
|
359
|
+
<!-- Harvest meta-data derived from WARC file names using regexp-rules defined in the warc-indexer config file.
|
360
|
+
Primarily used by the Royal Danish Library -->
|
361
|
+
<field name="arc_full" type="string" stored="true" docValues="false" />
|
362
|
+
<field name="arc_name" type="string" />
|
363
|
+
<field name="arc_orig" type="string" />
|
364
|
+
<field name="arc_job" type="string" />
|
365
|
+
<field name="arc_harvest" type="string" />
|
366
|
+
<field name="arc_harvesttime" type="string" />
|
168
367
|
|
169
|
-
<!--
|
170
|
-
|
171
|
-
|
172
|
-
|
368
|
+
<!-- Dynamic fields intended for institution-specific fields without changing the schema.
|
369
|
+
(yes, the arc_*-fields above should have been dynamic fields instead of hardcoded)
|
370
|
+
TODO: Add DocValues-enabled variants (take care not to change existing definitions) -->
|
371
|
+
<dynamicField name="*_i" type="int" indexed="true" stored="true" />
|
372
|
+
<dynamicField name="*_is" type="int" indexed="true" stored="true" multiValued="true" />
|
373
|
+
<dynamicField name="*_s" type="string" indexed="true" stored="true" />
|
374
|
+
<dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true" />
|
375
|
+
<dynamicField name="*_l" type="long" indexed="true" stored="true" />
|
376
|
+
<dynamicField name="*_ls" type="long" indexed="true" stored="true" multiValued="true" />
|
377
|
+
<dynamicField name="*_t" type="text_general" indexed="true" stored="true" />
|
378
|
+
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true" />
|
379
|
+
<dynamicField name="*_b" type="boolean" indexed="true" stored="true" />
|
380
|
+
<dynamicField name="*_bs" type="boolean" indexed="true" stored="true" multiValued="true" />
|
381
|
+
<dynamicField name="*_f" type="float" indexed="true" stored="true" />
|
382
|
+
<dynamicField name="*_fs" type="float" indexed="true" stored="true" multiValued="true" />
|
383
|
+
<dynamicField name="*_d" type="double" indexed="true" stored="true" />
|
384
|
+
<dynamicField name="*_ds" type="double" indexed="true" stored="true" multiValued="true" />
|
385
|
+
<dynamicField name="*_coordinate" type="double" indexed="true" stored="false" />
|
386
|
+
<dynamicField name="*_dt" type="date" indexed="true" stored="true" />
|
387
|
+
<dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true" />
|
388
|
+
<dynamicField name="*_p" type="location" indexed="true" stored="true" />
|
389
|
+
<dynamicField name="*_ti" type="int" indexed="true" stored="true" />
|
390
|
+
<dynamicField name="*_tis" type="int" indexed="true" stored="true" multiValued="true" />
|
391
|
+
<dynamicField name="*_tl" type="long" indexed="true" stored="true" />
|
392
|
+
<dynamicField name="*_tls" type="long" indexed="true" stored="true" multiValued="true" />
|
393
|
+
<dynamicField name="*_tf" type="float" indexed="true" stored="true" />
|
394
|
+
<dynamicField name="*_tfs" type="float" indexed="true" stored="true" multiValued="true" />
|
395
|
+
<dynamicField name="*_td" type="double" indexed="true" stored="true" />
|
396
|
+
<dynamicField name="*_tds" type="double" indexed="true" stored="true" multiValued="true" />
|
397
|
+
<dynamicField name="*_tdt" type="date" indexed="true" stored="true" />
|
398
|
+
<dynamicField name="*_tdts" type="date" indexed="true" stored="true" multiValued="true" />
|
399
|
+
<dynamicField name="ignored_*" type="ignored" multiValued="true" />
|
400
|
+
<dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true" />
|
401
|
+
<dynamicField name="random_*" type="random" />
|
402
|
+
|
403
|
+
<dynamicField name="*_ws" type="text_ws" indexed="true" stored="true" />
|
404
|
+
<dynamicField name="*_txt_en" type="text_en" indexed="true" stored="true" />
|
405
|
+
<dynamicField name="*_txt_en_split" type="text_en_splitting" indexed="true" stored="true" />
|
406
|
+
<dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true" />
|
407
|
+
<dynamicField name="*_txt_rev" type="text_general_rev" indexed="true" stored="true" />
|
408
|
+
<dynamicField name="*_phon_en" type="phonetic_en" indexed="true" stored="true" />
|
409
|
+
<dynamicField name="*_s_lower" type="lowercase" indexed="true" stored="true" />
|
410
|
+
<dynamicField name="*_descendent_path" type="descendent_path" indexed="true" stored="true" />
|
411
|
+
<dynamicField name="*_ancestor_path" type="ancestor_path" indexed="true" stored="true" />
|
412
|
+
<dynamicField name="*_point" type="point" indexed="true" stored="true" />
|
413
|
+
<dynamicField name="*_txt_ga" type="text_ga" indexed="true" stored="true" />
|
414
|
+
</fields>
|
415
|
+
|
416
|
+
<uniqueKey>id</uniqueKey>
|
417
|
+
|
418
|
+
<!-- TODO: Remove all copyFields where the source is indexed as text and adjust solrconfig.xml
|
419
|
+
to also search in those fields (edismax parser qf) -->
|
420
|
+
<copyField source="author" dest="text" />
|
421
|
+
<copyField source="keywords" dest="text" />
|
422
|
+
<copyField source="wct_title" dest="text" />
|
423
|
+
<copyField source="wct_description" dest="text" />
|
424
|
+
<copyField source="content" dest="text" />
|
425
|
+
<copyField source="url_norm" dest="url_search" />
|
426
|
+
<copyField source="resourcename" dest="resourcename_facet"/>
|
427
|
+
|
428
|
+
<types>
|
429
|
+
<!-- Guiding principles:
|
430
|
+
|
431
|
+
Atomic types are single-valued indexed & docValues, but not stored. This allows for low-cost faceting,
|
432
|
+
grouping and sorting. The downside is a performance penalty on document retrieval where a full document
|
433
|
+
takes longer to retrieve. Enabling stored speeds up retrieval at the cost of increased index size.
|
434
|
+
|
435
|
+
Text types are single-valued indexed & stored, but not docValued (DV is not currently possible for Text).
|
436
|
+
|
437
|
+
Deviations are normally handled by overriding for the specific fields
|
438
|
+
-->
|
439
|
+
|
440
|
+
<fieldType name="string" class="solr.StrField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
|
441
|
+
<fieldType name="boolean" class="solr.BoolField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
|
442
|
+
<fieldType name="int" class="solr.IntPointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
|
443
|
+
<fieldType name="float" class="solr.FloatPointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
|
444
|
+
<fieldType name="long" class="solr.LongPointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
|
445
|
+
<fieldType name="double" class="solr.DoublePointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
|
446
|
+
<fieldType name="date" class="solr.DatePointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
|
447
|
+
<fieldType name="binary" class="solr.BinaryField" indexed="false" docValues="false" stored="true" multiValued="false" />
|
448
|
+
<fieldType name="random" class="solr.RandomSortField" />
|
449
|
+
|
450
|
+
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
|
451
|
+
<analyzer>
|
452
|
+
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
453
|
+
</analyzer>
|
454
|
+
</fieldType>
|
173
455
|
|
174
|
-
|
175
|
-
<
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
<copyField source="title" dest="text"/>
|
190
|
-
<copyField source="author" dest="text"/>
|
191
|
-
<copyField source="keywords" dest="text"/>
|
192
|
-
<copyField source="description" dest="text"/>
|
193
|
-
<copyField source="wct_title" dest="text"/>
|
194
|
-
<copyField source="wct_description" dest="text"/>
|
195
|
-
<copyField source="url" dest="text"/>
|
196
|
-
<copyField source="content" dest="text"/>
|
197
|
-
<copyField source="resourcename" dest="resourcename_facet"/>
|
198
|
-
|
199
|
-
<types>
|
200
|
-
<fieldType name="string" class="solr.StrField" sortMissingLast="true" />
|
201
|
-
<fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true" docValues="true" />
|
202
|
-
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
|
203
|
-
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
|
204
|
-
<fieldType name="int" class="solr.TrieIntField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
205
|
-
<fieldType name="float" class="solr.TrieFloatField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
206
|
-
<fieldType name="long" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
207
|
-
<fieldType name="double" class="solr.TrieDoubleField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
208
|
-
<fieldType name="ints" class="solr.TrieIntField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
209
|
-
<fieldType name="floats" class="solr.TrieFloatField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
210
|
-
<fieldType name="longs" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
211
|
-
<fieldType name="doubles" class="solr.TrieDoubleField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
212
|
-
<fieldType name="tint" class="solr.TrieIntField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
213
|
-
<fieldType name="tfloat" class="solr.TrieFloatField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
214
|
-
<fieldType name="tlong" class="solr.TrieLongField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
215
|
-
<fieldType name="tdouble" class="solr.TrieDoubleField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
216
|
-
<fieldType name="tints" class="solr.TrieIntField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
217
|
-
<fieldType name="tfloats" class="solr.TrieFloatField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
218
|
-
<fieldType name="tlongs" class="solr.TrieLongField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
219
|
-
<fieldType name="tdoubles" class="solr.TrieDoubleField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
220
|
-
<fieldType name="date" class="solr.TrieDateField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
221
|
-
<fieldType name="dates" class="solr.TrieDateField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
222
|
-
<fieldType name="tdate" class="solr.TrieDateField" docValues="true" precisionStep="6" positionIncrementGap="0"/>
|
223
|
-
<fieldType name="tdates" class="solr.TrieDateField" docValues="true" precisionStep="6" positionIncrementGap="0" multiValued="true"/>
|
224
|
-
<fieldType name="binary" class="solr.BinaryField"/>
|
225
|
-
<fieldType name="random" class="solr.RandomSortField" indexed="true" />
|
226
|
-
|
227
|
-
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
|
228
|
-
<analyzer>
|
229
|
-
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
230
|
-
</analyzer>
|
456
|
+
<!-- Used for parsing file paths, so that ["MOO BOO/FooBar_zoo.baz"] becomes ["moo", "boo", "foo", "bar", "zoo", "baz"] -->
|
457
|
+
<fieldType name="path" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
|
458
|
+
<analyzer type="index">
|
459
|
+
<tokenizer class="solr.StandardTokenizerFactory" />
|
460
|
+
<filter class="solr.WordDelimiterFilterFactory" preserveOriginal="0" />
|
461
|
+
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_path.txt" />
|
462
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
463
|
+
</analyzer>
|
464
|
+
<analyzer type="query">
|
465
|
+
<tokenizer class="solr.StandardTokenizerFactory" />
|
466
|
+
<filter class="solr.WordDelimiterFilterFactory" preserveOriginal="0" />
|
467
|
+
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_path.txt" />
|
468
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
469
|
+
</analyzer>
|
231
470
|
</fieldType>
|
232
471
|
|
233
|
-
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
472
|
+
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
|
473
|
+
<analyzer type="index">
|
474
|
+
<tokenizer class="solr.StandardTokenizerFactory" />
|
475
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
476
|
+
</analyzer>
|
477
|
+
<analyzer type="query">
|
478
|
+
<tokenizer class="solr.StandardTokenizerFactory" />
|
479
|
+
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
|
480
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
481
|
+
</analyzer>
|
243
482
|
</fieldType>
|
244
483
|
|
245
|
-
<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
484
|
+
<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
|
485
|
+
<analyzer type="index">
|
486
|
+
<tokenizer class="solr.StandardTokenizerFactory" />
|
487
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
488
|
+
<filter class="solr.EnglishPossessiveFilterFactory" />
|
489
|
+
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
|
490
|
+
</analyzer>
|
491
|
+
<analyzer type="query">
|
492
|
+
<tokenizer class="solr.StandardTokenizerFactory" />
|
493
|
+
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
|
494
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
495
|
+
<filter class="solr.EnglishPossessiveFilterFactory" />
|
496
|
+
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
|
497
|
+
</analyzer>
|
259
498
|
</fieldType>
|
260
499
|
|
261
|
-
<fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
500
|
+
<fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true" indexed="true" stored="true" multiValued="false">
|
501
|
+
<analyzer type="index">
|
502
|
+
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
503
|
+
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1" />
|
504
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
505
|
+
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
|
506
|
+
</analyzer>
|
507
|
+
<analyzer type="query">
|
508
|
+
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
509
|
+
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
|
510
|
+
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1" />
|
511
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
512
|
+
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
|
513
|
+
</analyzer>
|
275
514
|
</fieldType>
|
276
515
|
|
277
|
-
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
516
|
+
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true" indexed="true" stored="true" multiValued="false">
|
517
|
+
<analyzer>
|
518
|
+
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
519
|
+
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false" />
|
520
|
+
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0" />
|
521
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
522
|
+
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
|
523
|
+
<filter class="solr.EnglishMinimalStemFilterFactory" />
|
524
|
+
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
|
525
|
+
</analyzer>
|
287
526
|
</fieldType>
|
288
527
|
|
289
|
-
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
528
|
+
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
|
529
|
+
<analyzer type="index">
|
530
|
+
<tokenizer class="solr.StandardTokenizerFactory" />
|
531
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
532
|
+
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
|
533
|
+
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33" />
|
534
|
+
</analyzer>
|
535
|
+
<analyzer type="query">
|
536
|
+
<tokenizer class="solr.StandardTokenizerFactory" />
|
537
|
+
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
|
538
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
539
|
+
</analyzer>
|
301
540
|
</fieldType>
|
302
541
|
|
303
|
-
<fieldType name="phonetic_en"
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
542
|
+
<fieldType name="phonetic_en" class="solr.TextField" indexed="true" stored="true" multiValued="false">
|
543
|
+
<analyzer>
|
544
|
+
<tokenizer class="solr.StandardTokenizerFactory" />
|
545
|
+
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false" />
|
546
|
+
</analyzer>
|
308
547
|
</fieldType>
|
309
548
|
|
310
|
-
<fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
549
|
+
<fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
|
550
|
+
<analyzer>
|
551
|
+
<tokenizer class="solr.KeywordTokenizerFactory" />
|
552
|
+
<filter class="solr.LowerCaseFilterFactory" />
|
553
|
+
</analyzer>
|
315
554
|
</fieldType>
|
316
555
|
|
317
556
|
<fieldType name="descendent_path" class="solr.TextField">
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
557
|
+
<analyzer type="index">
|
558
|
+
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
|
559
|
+
</analyzer>
|
560
|
+
<analyzer type="query">
|
561
|
+
<tokenizer class="solr.KeywordTokenizerFactory" />
|
562
|
+
</analyzer>
|
324
563
|
</fieldType>
|
325
564
|
|
326
565
|
<fieldType name="ancestor_path" class="solr.TextField">
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
566
|
+
<analyzer type="index">
|
567
|
+
<tokenizer class="solr.KeywordTokenizerFactory" />
|
568
|
+
</analyzer>
|
569
|
+
<analyzer type="query">
|
570
|
+
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
|
571
|
+
</analyzer>
|
333
572
|
</fieldType>
|
334
573
|
|
335
574
|
<fieldType name="ignored" stored="false" indexed="false" docValues="false" multiValued="true" class="solr.StrField" />
|
336
|
-
<fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
|
337
|
-
<fieldType name="location" class="solr.LatLonPointSpatialField" docValues="true"/>
|
575
|
+
<fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d" />
|
576
|
+
<fieldType name="location" class="solr.LatLonPointSpatialField" indexed="true" stored="false" docValues="true" multiValued="false" />
|
338
577
|
|
339
578
|
<fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100">
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
579
|
+
<analyzer>
|
580
|
+
<tokenizer class="solr.StandardTokenizerFactory" />
|
581
|
+
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt" />
|
582
|
+
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" />
|
583
|
+
<filter class="solr.IrishLowerCaseFilterFactory" />
|
584
|
+
</analyzer>
|
346
585
|
</fieldType>
|
347
586
|
|
348
587
|
<!-- BL UKWA: additional -->
|
349
|
-
<fieldType name="literal_ngram"
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
588
|
+
<fieldType name="literal_ngram" class="solr.TextField" indexed="true" stored="false" multiValued="false" >
|
589
|
+
<analyzer>
|
590
|
+
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
591
|
+
<filter class="solr.NGramFilterFactory" minGramSize="2" maxGramSize="5" />
|
592
|
+
</analyzer>
|
354
593
|
</fieldType>
|
355
594
|
|
356
|
-
<fieldType name="hex_text_shingle" class="solr.TextField" positionIncrementGap="100">
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
595
|
+
<fieldType name="hex_text_shingle" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" docValues="false">
|
596
|
+
<analyzer>
|
597
|
+
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
598
|
+
<filter class="solr.ShingleFilterFactory" minShingleSize="4" maxShingleSize="8" outputUnigrams="false" outputUnigramsIfNoShingles="false" tokenSeparator=" " />
|
599
|
+
</analyzer>
|
361
600
|
</fieldType>
|
362
601
|
<!--:BL UKWA -->
|
363
|
-
|
602
|
+
</types>
|
364
603
|
</schema>
|