warclight 0.6.3 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
data/solr/conf/schema.xml CHANGED
@@ -16,349 +16,588 @@
16
16
  limitations under the License.
17
17
  -->
18
18
 
19
+ <!--
20
+ This schema is for Solr 7+ and will not work under Solr 6.
21
+ -->
22
+
19
23
  <schema name="ukwa" version="1.6">
20
- <fields>
21
- <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" docValues="true"/>
22
- <field name="_version_" type="long" indexed="true" docValues="true"/>
23
- <field name="_root_" type="string" indexed="true" docValues="true" />
24
- <field name="_text_" type="text_general" indexed="true" stored="false" multiValued="true"/>
24
+ <fields>
25
+ <!-- Solr special purpose meta-fields. Explicit attributes to be sure they are set correctly -->
26
+ <field name="id" type="string" indexed="true" stored="true" docValues="true" required="true" />
27
+ <field name="_version_" type="long" indexed="true" stored="true" docValues="true" />
28
+ <field name="_root_" type="string" indexed="true" stored="true" docValues="true"/>
29
+ <field name="_text_" type="text_general" multiValued="true" /> <!-- Isn't this only used for schema-less? -->
25
30
 
26
- <!--Not defined in SolrFields. Schema only defintion -->
27
- <field name="index_time" type="date" indexed="true" stored="false" docValues="true" default="NOW" />
31
+ <!-- The time of document indexing. Set automatically by Solr.
32
+ Sample use: Freezing a query result even when new documents are added to the index:
33
+ q=foo&fq=index_time:[* TO 2018-05-16T10:33:00Z]
34
+ Sample use: Discover new documents added since last check for new documents:
35
+ q=*:*&fq=index_time:[2018-05-16T10:33:00Z TO *] -->
36
+ <field name="index_time" type="date" default="NOW" />
28
37
 
29
- <!-- BL UKWA: additional -->
30
- <field name="access_terms" type="string" indexed="true" docValues="true" multiValued="true"/>
31
- <field name="author" type="string" indexed="true" docValues="true" multiValued="true"/>
32
- <field name="category" type="text_general" indexed="true" stored="true"/>
33
- <field name="collection" type="string" indexed="true" docValues="true" multiValued="true"/>
34
- <field name="collections" type="string" indexed="true" docValues="true" multiValued="true"/>
35
- <field name="comments" type="text_general" indexed="true" stored="true"/>
36
- <field name="content_encoding" type="string" indexed="true" docValues="true" multiValued="false"/>
37
- <field name="content_ffb" type="string" indexed="true" docValues="true" multiValued="false"/>
38
- <field name="content_first_bytes" type="hex_text_shingle" indexed="true" stored="true" multiValued="false"/>
39
- <field name="content_language" type="string" indexed="true" docValues="true" multiValued="false"/>
40
- <field name="content_length" type="tint" indexed="true" stored="false" multiValued="false" docValues="true"/>
41
- <field name="content_metadata_ss" type="string" indexed="true" docValues="true" multiValued="true"/>
42
- <field name="content_metadata" type="text_general" indexed="true" stored="true" multiValued="false"/>
43
- <field name="content_text_length" type="tint" indexed="true" stored="false" multiValued="false" docValues="true"/>
44
- <field name="content_type_droid" type="string" indexed="true" docValues="true" multiValued="false"/>
45
- <field name="content_type_ext" type="string" indexed="true" docValues="true" multiValued="false"/>
46
- <field name="content_type_full" type="string" indexed="true" docValues="true" multiValued="false"/>
47
- <field name="content_type_norm" type="string" indexed="true" docValues="true" multiValued="false" default="other"/>
48
- <field name="content_type_served" type="string" indexed="true" docValues="true" multiValued="false"/>
49
- <field name="content" type="text_general" indexed="true" stored="true" multiValued="true"/>
50
- <field name="content_type_tika" type="string" indexed="true" docValues="true" multiValued="false"/>
51
- <field name="content_type" type="string" indexed="true" docValues="true" multiValued="true"/>
52
- <field name="content_type_version" type="string" indexed="true" docValues="true" multiValued="false"/>
53
- <field name="crawl_dates" type="tdate" indexed="true" stored="true" multiValued="true"/>
54
- <field name="crawl_date" type="tdate" indexed="true" stored="false" multiValued="false" docValues="true"/>
55
- <field name="crawl_year_month_day" type="int" indexed="true" docValues="true" multiValued="false"/>
56
- <field name="crawl_year_month" type="int" indexed="true" docValues="true" multiValued="false"/>
57
- <field name="crawl_years" type="int" indexed="true" docValues="true" multiValued="true"/>
58
- <field name="crawl_year" type="int" indexed="true" docValues="true" multiValued="false"/>
59
- <field name="description" type="text_general" indexed="true" stored="true"/>
60
- <field name="domain" type="string" indexed="true" docValues="true" multiValued="false"/>
61
- <field name="elements_used" type="string" indexed="true" docValues="true" multiValued="true"/>
62
- <field name="generator" type="string" indexed="true" docValues="true" multiValued="true"/>
63
- <field name="hash" type="string" indexed="true" docValues="true" multiValued="false"/>
64
- <field name="hashes" type="string" indexed="true" docValues="true" multiValued="true"/>
65
- <field name="host" type="string" indexed="true" docValues="true" multiValued="false"/>
66
- <field name="host_surt" type="string" indexed="true" docValues="true" multiValued="true"/>
67
- <field name="id_long" type="long" indexed="true" stored="true" multiValued="false"/>
68
- <field name="image_colours" type="string" indexed="true" stored="true" multiValued="true"/>
69
- <field name="image_dominant_colour" type="string" indexed="true" stored="true" multiValued="false"/>
70
- <field name="image_faces_count" type="tint" indexed="true" stored="true" multiValued="false"/>
71
- <field name="image_faces" type="string" indexed="false" stored="true" multiValued="true"/>
72
- <field name="image_height" type="tlong" indexed="true" stored="true" multiValued="false"/>
73
- <field name="image_size" type="tlong" indexed="true" stored="true" multiValued="false"/>
74
- <field name="image_width" type="tlong" indexed="true" stored="true" multiValued="false"/>
75
- <field name="keywords" type="text_general" indexed="true" stored="true"/>
76
- <field name="last_modified" type="tdate" indexed="true" stored="true" docValues="true"/>
77
- <field name="last_modified_year" type="string" indexed="true" docValues="true"/>
78
- <field name="license_url" type="string" indexed="true" docValues="true" multiValued="true"/>
79
- <field name="links_images" type="text_general" indexed="true" stored="true" multiValued="true"/>
80
- <field name="links_domains" type="string" indexed="true" docValues="true" multiValued="true"/>
81
- <field name="links_hosts" type="string" indexed="true" docValues="true" multiValued="true"/>
82
- <field name="links_hosts_surts" type="string" indexed="true" docValues="true" multiValued="true"/>
83
- <field name="links_norm" type="string" indexed="true" docValues="true" multiValued="true"/>
84
- <field name="links_public_suffixes" type="string" indexed="true" docValues="true" multiValued="true"/>
85
- <field name="links" type="string" indexed="true" docValues="true" multiValued="true"/>
86
- <field name="locations" type="location" indexed="true" stored="true" multiValued="true"/>
87
- <field name="parse_error" type="string" indexed="true" docValues="true" multiValued="true"/>
88
- <field name="pdf_pdfa_errors" type="text_general" indexed="false" stored="true" multiValued="true"/>
89
- <field name="pdf_pdfa_is_valid" type="string" indexed="true" docValues="true" multiValued="false"/>
90
- <field name="postcode_district" type="string" indexed="true" docValues="true" multiValued="true"/>
91
- <field name="postcode" type="string" indexed="true" docValues="true" multiValued="true"/>
92
- <field name="publication_date" type="tdate" indexed="true" stored="true" multiValued="false"/>
93
- <field name="publication_year" type="string" indexed="true" docValues="true" multiValued="false"/>
94
- <field name="public_suffix" type="string" indexed="true" docValues="true" multiValued="false"/>
95
- <field name="record_type" type="string" indexed="true" stored="true" multiValued="false" docValues="true"/>
96
- <field name="redirect_to_norm" type="string" indexed="true" stored="false" docValues="true" multiValued="false"/>
97
- <field name="referrer_url" type="string" indexed="true" stored="true" multiValued="false" docValues="true"/>
98
- <field name="resourcename" type="text_general" indexed="true" stored="true"/>
99
- <field name="resourcename_facet" type="string" indexed="true" multiValued="false" docValues="true"/>
100
- <field name="sentiment_score" type="float" indexed="true" stored="true" multiValued="false"/>
101
- <field name="sentiment" type="string" indexed="true" docValues="true" multiValued="false"/>
102
- <field name="server" type="string" indexed="true" docValues="true" multiValued="true"/>
103
- <field name="source_file_path" type="string" indexed="true" docValues="true" />
104
- <field name="source_file_offset" type="tlong" indexed="true" stored="true" />
105
- <field name="source_file" type="string" indexed="true" docValues="true" />
106
- <field name="status_code" type="int" indexed="true" stored="true" docValues="true" />
107
- <field name="subject" type="text_general" indexed="true" stored="true" multiValued="true"/>
108
- <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
109
- <field name="title" type="text_general" indexed="true" stored="true" multiValued="false"/>
110
- <field name="type" type="string" indexed="true" docValues="true" multiValued="false"/>
111
- <field name="url_norm" type="string" indexed="true" stored="false" docValues="true" multiValued="false"/>
112
- <field name="url_path" type="string" indexed="true" stored="false" docValues="true" multiValued="false"/>
113
- <field name="url" type="string" indexed="true" stored="false" docValues="true" multiValued="false"/>
114
- <field name="url_type" type="text_general" indexed="true" stored="true"/>
115
- <field name="wayback_date" type="long" indexed="false" stored="true" docValues="false" multiValued="false"/>
116
- <field name="wct_agency" type="string" indexed="true" docValues="true" multiValued="false"/>
117
- <field name="wct_collections" type="string" indexed="true" docValues="true" multiValued="true"/>
118
- <field name="wct_description" type="text_general" indexed="true" stored="true"/>
119
- <field name="wct_instance_id" type="int" indexed="true" stored="true" multiValued="false"/>
120
- <field name="wct_subjects" type="string" indexed="true" docValues="true" multiValued="true"/>
121
- <field name="wct_target_id" type="string" indexed="true" docValues="true" multiValued="false"/>
122
- <field name="wct_title" type="string" indexed="true" docValues="true"/>
123
- <field name="xml_root_ns" type="string" indexed="true" docValues="true" multiValued="false"/>
124
- <field name="warc_key_id" type="string" indexed="true" docValues="true" multiValued="false"/>
125
- <field name="warc_ip" type="string" indexed="true" docValues="true" multiValued="false"/>
126
- <!--:BL UKWA -->
38
+ <!-- BL UKWA: Access flag (i.e. Open Access or not) -->
39
+ <field name="access_terms" type="string" multiValued="true" />
40
+
41
+ <!-- Author extracted from HTML meta-fields, Word documents meta data, image Exif etc.
42
+ Search directly in the author-field is verbatim and thus not very usable for user-defined queries.
43
+ Sample use: Faceting with facet=true&facet.field=author -->
44
+ <field name="author" type="string" />
45
+
46
+ <!-- Does not seem to be used as of 20180516 -->
47
+ <field name="category" type="text_general" />
48
+
49
+ <!-- Institution-specific collection names. Can be specified when calling the indexer -->
50
+ <field name="collection" type="string" multiValued="true" /> <!-- Why is this multi-valued? -->
51
+ <field name="collections" type="string" multiValued="true" />
52
+
53
+ <!-- Does not seem to be used as of 20180516 -->
54
+ <field name="comments" type="text_general" multiValued="true" />
55
+
56
+ <!-- Dublin Core description tag from HTML pages -->
57
+ <field name="description" type="text_general" />
58
+
59
+ <!-- hashtags and other keywords -->
60
+ <field name="keywords" type="text_general" multiValued="true" />
61
+
62
+ <!-- Licence URL as specified on HTML pages using links with rel=license -->
63
+ <field name="license_url" type="string" multiValued="true" />
64
+
65
+ <!-- The core content of the resource (all text with tags stripped from HTML pages, the text in a Word document)
66
+ Note: This field is not searchable. Use 'text' for search.
67
+ Sample use: Highlighting with q=floodgate&hl=true&hl.field=content -->
68
+ <field name="content" type="text_general" indexed="false" />
69
+
70
+ <!-- The original encoding of the content (UTF-8/ISO-8859-1/Windows-1250...)
71
+ Note: Regardless of the original encoding, content is always converted to UTF-8 in the Solr document -->
72
+ <field name="content_encoding" type="string" />
73
+
74
+ <!-- The first 4 bytes of the content, represented as lower-case hex with no space -->
75
+ <field name="content_ffb" type="string" />
76
+
77
+ <!-- The first 32 bytes of the content, represented as shingled space-separated lower-case hex.
78
+ Sample use: Locate sub-sequences of bytes within the first 32 bytes (signature search):
79
+ content_first_bytes:"89 50 4e 47" locates content which is probably PNG -->
80
+ <field name="content_first_bytes" type="hex_text_shingle" />
81
+
82
+ <!-- Language as detected by Tika.
83
+ Sample use: Faceting on language with facet=true&facet.field=content_language -->
84
+ <field name="content_language" type="string" />
85
+
86
+ <!-- The content length measured in bytes.
87
+ Sample use: Sort by content size with sort=content_length desc
88
+ Sample use: Size statistics for the full result set: stats=true&stats.field=content_length -->
89
+ <field name="content_length" type="int" />
90
+ <!-- <field name="content_metadata_ss" type="string" multiValued="true" />--> <!-- Not used for anything -->
91
+
92
+ <!-- If warc.index.tika.extract_all_metadata was enabled during indexing, Tika metadata is added here. -->
93
+ <field name="content_metadata" type="text_general" />
94
+
95
+ <!-- The content length measured in characters. Mostly relevant for text-based formats (html, doc, pdf...).
96
+ Sample use: Sort by text length with sort=content_text_length desc
97
+ Sample use: Size statistics for the full result set: stats=true&stats.field=content_text_length -->
98
+ <field name="content_text_length" type="int" />
99
+
100
+ <!-- The MIME content type as determined by DROID -->
101
+ <field name="content_type_droid" type="string" />
102
+ <!-- The file extension: my.sample.png will yield 'png' -->
103
+ <field name="content_type_ext" type="string" />
104
+ <!-- Best-guess MIME-type for the content, based on droid, Tika, WARC-header, HTTP-header and
105
+ webarchive-discovery processing -->
106
+ <field name="content_type_full" type="string" />
107
+ <!-- Content type represented as low-cardinality human-readable text: image, video, text etc. -->
108
+ <field name="content_type_norm" type="string" default="other" />
109
+ <!-- The MIME content type as specified by the web server the resource was harvested from -->
110
+ <field name="content_type_served" type="string" />
111
+ <!-- The MIME content type as determined by Tika -->
112
+ <field name="content_type_tika" type="string" />
113
+ <!-- Not clear what this is. TODO: Determine what it is -->
114
+ <field name="content_type" type="string" /> <!-- Used to be multi-valued -->
115
+ <!-- The version for the MIME type, if available -->
116
+ <field name="content_type_version" type="string" />
117
+
118
+ <!-- The HTML elements used if the resource is a HTML page -->
119
+ <field name="elements_used" type="string" multiValued="true" />
120
+ <!-- Hash of the content (SHA1) -->
121
+ <field name="hash" type="string" />
122
+
123
+ <!-- Does not seem to be used as of 20180516 -->
124
+ <field name="hashes" type="string" multiValued="true" />
125
+ <!-- Does not seem to be used as of 20180516 -->
126
+ <field name="id_long" type="long" />
127
+
128
+ <!-- The date represented as a long in the form of YYYYmmddHHMMSS, which is compatible with Wayback.
129
+ The field is not searchable. Use crawl_date for search and general processing -->
130
+ <field name="wayback_date" type="long" indexed="false" stored="true" docValues="false" />
131
+ <!-- If webarchive-discovery runs in update-mode, multiple harvests of the same URL will be collapsed to
132
+ a single document and the dates from the different harvests will be added to this field -->
133
+ <field name="crawl_dates" type="date" stored="true" docValues="false" multiValued="true" />
134
+ <!-- The crawl-date as specified in the WARC.
135
+ Sample use: Faceting by date with
136
+ facet=true&facet.range=crawl_date&facet.range.start=2010-01-01T00:00:00Z&facet.range.end=2019-01-01T00:00:00Z&facet.range.gap=+1MONTH&facet.range.method=dv
137
+ Sample use: Sorting newest material first: sort=crawl_date desc
138
+ -->
139
+ <field name="crawl_date" type="date" />
140
+ <!-- crawl_year_month_day & crawl_year_month are not used for anything -->
141
+ <!-- <field name="crawl_year_month_day" type="int" />
142
+ <field name="crawl_year_month" type="int" />-->
143
+ <!-- If webarchive-discovery runs in update-mode, multiple harvests of the same URL will be collapsed to
144
+ a single document and the years from the dates from the different harvests will be added to this field -->
145
+ <field name="crawl_years" type="int" multiValued="true" />
146
+ <!-- The year extracted from crawl_date. Faster than crawl_date if used for faceting.
147
+ Sample use: Faceting by year with facet.field=crawl_year&facet.sort=index&facet=true -->
148
+ <field name="crawl_year" type="int" />
149
+ <!-- Last modified timestamp extracted from the resource. Sources such as JPEG images, PDF files and Word
150
+ documents often have this.
151
+ Note: This is not a very reliable timestamp for most formats. JPEGs tend to work quite well.
152
+ Sample use: Sorting by age as stated in the format sort=last_modified asc -->
153
+ <field name="last_modified" type="date" />
154
+ <!-- The year from last_modified -->
155
+ <field name="last_modified_year" type="string" /> <!-- Why is this a string? -->
156
+
157
+ <!-- Heavily normalised URL: http/https is collapsed to http, everything is lowercased, trailing / are removed
158
+ for all URLs, except those pointing to root, e.g. "http://example.com/". There is more processing than
159
+ that. If the field is to be queried with a user-provided URL, it is highly recommended to use the method
160
+ Normalisation.canonicaliseURL() from webarchive-discovery to ensure match.
161
+ This field matches normalisation with the links-field, making it possible to perform graph traversals.
162
+ Note: This field has very high cardinality (a little less than the number of documents in the index).
163
+ Faceting should be done with care and is likely to fail with an OutOfMemoryException on a large index
164
+ using stock Solr. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
165
+ <field name="url_norm" type="string" />
166
+ <!-- Variation of url_norm intended for search for partial URLs.
167
+ Sample use: Search for large images with q=url_search:"images/large" -->
168
+ <field name="url_search" type="path" stored="false" /> <!-- search only to save space-->
169
+ <!-- Path-only for the URL: http://example.com/foo/bar.png becomes /foo/bar.png -->
170
+ <field name="url_path" type="string" />
171
+ <!-- Original URL, as specified in the WARC header. Not analysed and thus likely to give false negatives
172
+ if searched directly with user-input. Consider using url_norm for searching.
173
+ Note: This field has very high cardinality (a little less than the number of documents in the index).
174
+ Faceting should be done with care and is likely to fail with an OutOfMemoryException on a large index
175
+ using stock Solr. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
176
+ <field name="url" type="string" />
177
+ <!-- Possible values: normal, robots.txt and slashpage (root page for the domain).
178
+ Sample use: Search only for root pages with q=url_type:slashpage -->
179
+ <field name="url_type" type="string" />
180
+ <!-- The domain from the URL. The domain is the short name, registered by the domain owner.
181
+ This can be coupled with the links_domains field for building graphs.
182
+ Sample use: Faceting to show most popular domains with facet=true&facet.field=domain -->
183
+ <field name="domain" type="string" />
184
+ <!-- The host from the URL. The ending of the host is always the same as the domain, with optional prefix,
185
+ e.g. a host can be foo.bar.zoo.example.com or just example.com for the domain example.com.
186
+ Sample use: Faceting to show most popular hosts with facet=true&facet.field=host -->
187
+ <field name="host" type="string" />
188
+ <!-- The host from the URL in SURT'ed form: http://webarchivingbucket.com/techblog/?p=48
189
+ e.g. foo.bar.dk becomes the three values ["(dk,", "(dk,bar,", "(dk,bar,foo"]. -->
190
+ <field name="host_surt" type="string" multiValued="true" />
191
+ <!-- The part below the domain in the URL. For all dk-domains it will be dk. For domains such
192
+ as myname.blogspot.com and mycompany.co.uk it will be blogspot.com and co.uk. -->
193
+ <field name="public_suffix" type="string" />
194
+ <!-- The last part of the URL, typically a filename, e.g. giant_rabbitFoot.png. The field is analysed with an
195
+ aggressive tokenizer, so that giant_rabbitFoot.png is split into [giant, rabbit, foot, png] and searches
196
+ are not dependent on knowing file extensions etc.
197
+ See resourcename_facet for sorting, grouping and faceting.
198
+ Sample use: Search for images of kittens: q=resourcename:kittens&fq=content_type_norm:image -->
199
+ <field name="resourcename" type="path" />
200
+ <!-- Mirror of resourcename intended for sorting, grouping and faceting.
201
+ Important note: This is a high-cardinality field. Faceting on a web archive with billions of records
202
+ will likely lead to memory problems. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
203
+ <field name="resourcename_facet" type="string" />
204
+
205
+ <!-- Does not seem to be used as of 20180516 -->
206
+ <field name="image_colours" type="string" multiValued="true" />
207
+ <!-- If warc.index.extract.content.images.dominantColours was true during indexing, this field will contain
208
+ the dominant colour if the resource is an image. The colour is a human readable name, e.g. crimson,
209
+ ivory or goldenrod, as defined by https://www.w3.org/TR/SVG/types.html#ColorKeywords -->
210
+ <field name="image_dominant_colour" type="string" />
211
+ <!-- If warc.index.extract.content.images.detectFaces was true during indexing, this will contain the number
212
+ of faces detected if the resource is an image.
213
+ Note: Face recognition is heavy and it is recommended not to enable it unless the need is high -->
214
+ <field name="image_faces_count" type="int" />
215
+ <!-- If warc.index.extract.content.images.detectFaces was true during indexing, this will contain the faces
216
+ detected if the resource is an image. A face is represented by a bounding box relative to the original
217
+ image.
218
+ Note: Face recognition is heavy and it is recommended not to enable it unless the need is high -->
219
+ <field name="image_faces" type="string" indexed="false" stored="true" docValues="false" multiValued="true" />
220
+ <!-- Image height in pixels.
221
+ Sample use: Get statistics for image height with stats=true&stats.field=image_height -->
222
+ <field name="image_height" type="long" />
223
+ <!-- Image size in pixels (width*height).
224
+ Sample use: Get statistics for image size with stats=true&stats.field=image_size
225
+ Sample use: Locate largest images with sort=image_size desc -->
226
+ <field name="image_size" type="long" />
227
+ <!-- Image width in pixels.
228
+ Sample use: Get statistics for image width with stats=true&stats.field=image_width -->
229
+ <field name="image_width" type="long" />
230
+
231
+ <!-- Links to images shown on a given web page (aka embedded images).
232
+ Normalised the same way as url_norm -->
233
+ <field name="links_images" type="string" multiValued="true" />
234
+ <!-- domains from outgoing links for a HTML page -->
235
+ <field name="links_domains" type="string" multiValued="true" />
236
+ <!-- hosts from outgoing links for a HTML page -->
237
+ <field name="links_hosts" type="string" multiValued="true" />
238
+ <!-- SURT'ed form of hosts (see the host_surt field) from outgoing links for a HTML page -->
239
+ <field name="links_hosts_surts" type="string" multiValued="true" />
240
+ <!-- Might be used in the future but will take up a lot of space (same as 'links') -->
241
+ <!-- <field name="links_norm" type="string" multiValued="true" />-->
242
+ <!-- public suffixes (see public_suffix field) from outgoing links for a HTML page -->
243
+ <field name="links_public_suffixes" type="string" multiValued="true" />
244
+ <!-- Links to external (i.e. not images and other embedded content).
245
+ Normalised the same way as url_norm
246
+ Note: This field has extremely high cardinality (10 times the number of documents in the index).
247
+ Faceting should be done with care and is highly likely to fail with an OutOfMemoryException even on a
248
+ medium sized index using stock Solr. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
249
+ <field name="links" type="string" multiValued="true" />
250
+
251
+ <!-- Geographical coordinates, extracted from postcodes.
252
+ Sample use: Find images taken within a given radius from a given geo location with Solr geodist search
253
+ q=({!geofilt sfield=locations}) AND *:*&pt=56.17,10.20&d=0.8
254
+ where the coordinates can be retrieved from e.g. Google Maps, where they are shown at the bottom of
255
+ the map when a location is clicked. d is distance in kilometers.
256
+ See https://lucene.apache.org/solr/guide/7_3/spatial-search.html for details -->
257
+ <field name="locations" type="location" multiValued="true" />
258
+
259
+ <!-- Non-fatal errors during meta data extraction as part of indexing -->
260
+ <field name="parse_error" type="string" multiValued="true" />
261
+ <!-- If warc.index.extract.content.extractApachePreflightErrors is true during indexing, this field will
262
+ contain errors encountered during PDF/A-validation -->
263
+ <field name="pdf_pdfa_errors" type="string" multiValued="true" />
264
+ <!-- If warc.index.extract.content.extractApachePreflightErrors is true during indexing, this field will
265
+ be true if the resource was a PDF and a valid PDF/A.
266
+ Note: PDF validation is heavy and it is recommended not to enable it unless the need is high-->
267
+ <field name="pdf_pdfa_is_valid" type="string" />
268
+ <!-- UK postcodes only, as they are easily recognizable -->
269
+ <field name="postcode_district" type="string" multiValued="true" />
270
+ <!-- UK postcodes only, as they are easily recognizable -->
271
+ <field name="postcode" type="string" multiValued="true" />
272
+
273
+ <!-- Does not seem to be used as of 20180516 -->
274
+ <field name="publication_date" type="date" />
275
+ <!-- Does not seem to be used as of 20180516 -->
276
+ <field name="publication_year" type="string" />
277
+ <!-- The source format. Currently arc or warc. This might be extended in the future -->
278
+ <field name="record_type" type="string" />
279
+ <!-- If warc.index.extract.content.text_sentimentj was true during indexing, this field will contain a
280
+ numeric score for the sentiment, with 0.0 being "very negative" and high values being "very positive" -->
281
+ <field name="sentiment_score" type="float" />
282
+ <!-- If warc.index.extract.content.text_sentimentj was true during indexing, this field will contain a
283
+ human readable assessment of the sentiment, from "very negative" to "very positive" -->
284
+ <field name="sentiment" type="string" />
285
+
286
+ <!-- The HTTP server as stated in the HTTP-headers -->
287
+ <field name="server" type="string" multiValued="true" />
288
+ <!-- Status-code for the resource, as stated in the HTTP-headers from the originating web server -->
289
+ <field name="status_code" type="int" />
290
+ <!-- The generator for the resource, e.g. Wordpress or Photoshop -->
291
+ <field name="generator" type="string" multiValued="true" />
292
+ <!-- Does not seem to be used as of 20180516 -->
293
+ <field name="referrer_url" type="string" />
294
+ <!-- If the resource is returned with a 3xx HTTP response code, it is a redirection. This field contains
295
+ the URL that the resource redirects to, normalised like url_norm -->
296
+ <field name="redirect_to_norm" type="string" />
297
+
298
+ <!-- The full path of the origin container (typically WARC) for the harvested resource, e.g.
299
+ /harvests/full/2018-05/myharvest_20180516_1706.warc.gz
300
+ Sample use: Delivery of the raw resource by seeking to the source_file_offset in source_file and
301
+ reading the resource bytes (the size of the resource is stated in the headers in the source_file) -->
302
+ <field name="source_file_path" type="string" />
303
+ <!-- The offset for the resource within the source_file (aka WARC).
304
+ Sample use: Delivery of the raw resource by seeking to the source_file_offset in source_file and
305
+ reading the resource bytes (the size of the resource is stated in the headers in the source_file) -->
306
+ <field name="source_file_offset" type="long" /> <!-- docValues as it will probably be used for streaming export -->
307
+ <!-- The file name of the origin container (typically WARC) for the harvested resource, e.g.
308
+ myharvest_20180516_1706.warc.gz
309
+ Sample use: Delivery of the raw resource by seeking to the source_file_offset in source_file and
310
+ reading the resource bytes (the size of the resource is stated in the headers in the source_file) -->
311
+ <field name="source_file" type="string" />
312
+
313
+ <!-- Catch-all search field. All text content is copied here. -->
314
+ <field name="text" type="text_general" stored="false" multiValued="true" /> <!-- Catch-all -->
315
+ <!-- HTML page <title>, Word document title, Dublin Core title, etc -->
316
+ <field name="title" type="text_general" />
317
+ <!-- Variant of content_type_norm with human readable designations for the content type -->
318
+ <field name="type" type="string" />
319
+
320
+ <!-- Meta data from Web Curator Tool -->
321
+ <field name="wct_agency" type="string" />
322
+ <field name="wct_collections" type="string" multiValued="true" />
323
+ <field name="wct_description" type="text_general" />
324
+ <field name="wct_instance_id" type="int" indexed="true" stored="true" docValues="false" />
325
+ <field name="wct_subjects" type="string" multiValued="true" />
326
+ <field name="wct_target_id" type="string" />
327
+ <field name="wct_title" type="string" />
328
+
329
+ <!-- Root namespace for XML files.
330
+ Sample use: Facet to get most popular XML formats with facet=true&facet.field=xml_root_ns -->
331
+ <field name="xml_root_ns" type="string" />
332
+ <!-- WARC-Record-ID if available -->
333
+ <field name="warc_key_id" type="string" />
334
+ <!-- WARC-IP-Address if available -->
335
+ <field name="warc_ip" type="string" />
336
+
337
+ <!-- Geographical coordinates, extracted from image Exif data.
338
+ Sample use: Find images taken within a given radius from a given geo location with Solr geodist search
339
+ q=({!geofilt sfield=exif_location}) AND *:*&pt=56.17,10.20&d=0.8
340
+ where the coordinates can be retrieved from e.g. Google Maps, where they are shown at the bottom of
341
+ the map when a location is clicked. d is distance in kilometers.
342
+ See https://lucene.apache.org/solr/guide/7_3/spatial-search.html for details -->
343
+ <field name="exif_location" type="location" />
344
+ <!-- The Exif version (Exchangeable image file format) -->
345
+ <field name="exif_version" type="string" stored="true" docValues="false" />
346
+
347
+ <!-- Fuzzy matching on text for similarity search.
348
+ If warc.index.extract.content.text_fuzzy_hash is true during indexing, fields for SSDeep hashes will
349
+ be created. See https://ssdeep-project.github.io/ssdeep/ for details -->
350
+ <dynamicField name="ssdeep_hash_bs_*" type="string" stored="true" docValues="false" />
351
+ <!-- Does not seem to be used as of 20180517 -->
352
+ <dynamicField name="ssdeep_hash_ngram_bs_*" type="literal_ngram" stored="true" />
127
353
 
128
354
  <!-- User supplied Archive-It fields: -->
129
- <field name="institution" type="string" indexed="true" multiValued="false" docValues="true"/>
130
- <field name="collection_id" type="string" indexed="true" multiValued="false" docValues="true"/>
355
+ <field name="institution" type="string" />
356
+ <field name="collection_id" type="string" />
131
357
  <!--:User supplied Archive-It fields -->
132
358
 
133
- <dynamicField name="*_i" type="int" indexed="true" stored="true"/>
134
- <dynamicField name="*_is" type="ints" indexed="true" stored="true"/>
135
- <dynamicField name="*_s" type="string" indexed="true" stored="true" />
136
- <dynamicField name="*_ss" type="strings" indexed="true" stored="true"/>
137
- <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
138
- <dynamicField name="*_ls" type="longs" indexed="true" stored="true"/>
139
- <dynamicField name="*_t" type="text_general" indexed="true" stored="true"/>
140
- <dynamicField name="*_txt" type="text_general" indexed="true" stored="true"/>
141
- <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
142
- <dynamicField name="*_bs" type="booleans" indexed="true" stored="true"/>
143
- <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
144
- <dynamicField name="*_fs" type="floats" indexed="true" stored="true"/>
145
- <dynamicField name="*_d" type="double" indexed="true" stored="true"/>
146
- <dynamicField name="*_ds" type="doubles" indexed="true" stored="true"/>
147
- <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false" />
148
- <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
149
- <dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true"/>
150
- <dynamicField name="*_p" type="location" indexed="true" stored="true"/>
151
- <dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
152
- <dynamicField name="*_tis" type="tints" indexed="true" stored="true"/>
153
- <dynamicField name="*_tl" type="tlong" indexed="true" stored="true"/>
154
- <dynamicField name="*_tls" type="tlongs" indexed="true" stored="true"/>
155
- <dynamicField name="*_tf" type="tfloat" indexed="true" stored="true"/>
156
- <dynamicField name="*_tfs" type="tfloats" indexed="true" stored="true"/>
157
- <dynamicField name="*_td" type="tdouble" indexed="true" stored="true"/>
158
- <dynamicField name="*_tds" type="tdoubles" indexed="true" stored="true"/>
159
- <dynamicField name="*_tdt" type="tdate" indexed="true" stored="true"/>
160
- <dynamicField name="*_tdts" type="tdates" indexed="true" stored="true"/>
161
- <dynamicField name="ignored_*" type="ignored" multiValued="true"/>
162
- <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>
163
- <dynamicField name="random_*" type="random" />
164
-
165
- <!--:IMAGE EXIF-->
166
- <field name="exif_location" type="location" indexed="true" stored="true" multiValued="false"/>
167
- <field name="exif_version" type="string" indexed="true" stored="true" multiValued="false"/>
359
+ <!-- Harvest meta-data derived from WARC file names using regexp-rules defined in the warc-indexer config file.
360
+ Primarily used by the Royal Danish Library -->
361
+ <field name="arc_full" type="string" stored="true" docValues="false" />
362
+ <field name="arc_name" type="string" />
363
+ <field name="arc_orig" type="string" />
364
+ <field name="arc_job" type="string" />
365
+ <field name="arc_harvest" type="string" />
366
+ <field name="arc_harvesttime" type="string" />
168
367
 
169
- <!-- BL UKWA: additional -->
170
- <dynamicField name="ssdeep_hash_bs_*" type="string" indexed="true" stored="true" multiValued="false"/>
171
- <dynamicField name="ssdeep_hash_ngram_bs_*" type="literal_ngram" indexed="true" stored="true" multiValued="false"/>
172
- <!--:BL UKWA -->
368
+ <!-- Dynamic fields intended for institution-specific fields without changing the schema.
369
+ (yes, the arc_*-fields above should have been dynamic fields instead of hardcoded)
370
+ TODO: Add DocValues-enabled variants (take care not to change existing definitions) -->
371
+ <dynamicField name="*_i" type="int" indexed="true" stored="true" />
372
+ <dynamicField name="*_is" type="int" indexed="true" stored="true" multiValued="true" />
373
+ <dynamicField name="*_s" type="string" indexed="true" stored="true" />
374
+ <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true" />
375
+ <dynamicField name="*_l" type="long" indexed="true" stored="true" />
376
+ <dynamicField name="*_ls" type="long" indexed="true" stored="true" multiValued="true" />
377
+ <dynamicField name="*_t" type="text_general" indexed="true" stored="true" />
378
+ <dynamicField name="*_txt" type="text_general" indexed="true" stored="true" />
379
+ <dynamicField name="*_b" type="boolean" indexed="true" stored="true" />
380
+ <dynamicField name="*_bs" type="boolean" indexed="true" stored="true" multiValued="true" />
381
+ <dynamicField name="*_f" type="float" indexed="true" stored="true" />
382
+ <dynamicField name="*_fs" type="float" indexed="true" stored="true" multiValued="true" />
383
+ <dynamicField name="*_d" type="double" indexed="true" stored="true" />
384
+ <dynamicField name="*_ds" type="double" indexed="true" stored="true" multiValued="true" />
385
+ <dynamicField name="*_coordinate" type="double" indexed="true" stored="false" />
386
+ <dynamicField name="*_dt" type="date" indexed="true" stored="true" />
387
+ <dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true" />
388
+ <dynamicField name="*_p" type="location" indexed="true" stored="true" />
389
+ <dynamicField name="*_ti" type="int" indexed="true" stored="true" />
390
+ <dynamicField name="*_tis" type="int" indexed="true" stored="true" multiValued="true" />
391
+ <dynamicField name="*_tl" type="long" indexed="true" stored="true" />
392
+ <dynamicField name="*_tls" type="long" indexed="true" stored="true" multiValued="true" />
393
+ <dynamicField name="*_tf" type="float" indexed="true" stored="true" />
394
+ <dynamicField name="*_tfs" type="float" indexed="true" stored="true" multiValued="true" />
395
+ <dynamicField name="*_td" type="double" indexed="true" stored="true" />
396
+ <dynamicField name="*_tds" type="double" indexed="true" stored="true" multiValued="true" />
397
+ <dynamicField name="*_tdt" type="date" indexed="true" stored="true" />
398
+ <dynamicField name="*_tdts" type="date" indexed="true" stored="true" multiValued="true" />
399
+ <dynamicField name="ignored_*" type="ignored" multiValued="true" />
400
+ <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true" />
401
+ <dynamicField name="random_*" type="random" />
402
+
403
+ <dynamicField name="*_ws" type="text_ws" indexed="true" stored="true" />
404
+ <dynamicField name="*_txt_en" type="text_en" indexed="true" stored="true" />
405
+ <dynamicField name="*_txt_en_split" type="text_en_splitting" indexed="true" stored="true" />
406
+ <dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true" />
407
+ <dynamicField name="*_txt_rev" type="text_general_rev" indexed="true" stored="true" />
408
+ <dynamicField name="*_phon_en" type="phonetic_en" indexed="true" stored="true" />
409
+ <dynamicField name="*_s_lower" type="lowercase" indexed="true" stored="true" />
410
+ <dynamicField name="*_descendent_path" type="descendent_path" indexed="true" stored="true" />
411
+ <dynamicField name="*_ancestor_path" type="ancestor_path" indexed="true" stored="true" />
412
+ <dynamicField name="*_point" type="point" indexed="true" stored="true" />
413
+ <dynamicField name="*_txt_ga" type="text_ga" indexed="true" stored="true" />
414
+ </fields>
415
+
416
+ <uniqueKey>id</uniqueKey>
417
+
418
+ <!-- TODO: Remove all copyFields where the source is indexed as text and adjust solrconfig.xml
419
+ to also search in those fields (edismax parser qf) -->
420
+ <copyField source="author" dest="text" />
421
+ <copyField source="keywords" dest="text" />
422
+ <copyField source="wct_title" dest="text" />
423
+ <copyField source="wct_description" dest="text" />
424
+ <copyField source="content" dest="text" />
425
+ <copyField source="url_norm" dest="url_search" />
426
+ <copyField source="resourcename" dest="resourcename_facet"/>
427
+
428
+ <types>
429
+ <!-- Guiding principles:
430
+
431
+ Atomic types are single-valued indexed & docValues, but not stored. This allows for low-cost faceting,
432
+ grouping and sorting. The downside is a performance penalty on document retrieval where a full document
433
+ takes longer to retrieve. Enabling stored speeds up retrieval at the cost of increased index size.
434
+
435
+ Text types are single-valued indexed & stored, but not docValued (DV is not currently possible for Text).
436
+
437
+ Deviations are normally handled by overriding for the specific fields
438
+ -->
439
+
440
+ <fieldType name="string" class="solr.StrField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
441
+ <fieldType name="boolean" class="solr.BoolField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
442
+ <fieldType name="int" class="solr.IntPointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
443
+ <fieldType name="float" class="solr.FloatPointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
444
+ <fieldType name="long" class="solr.LongPointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
445
+ <fieldType name="double" class="solr.DoublePointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
446
+ <fieldType name="date" class="solr.DatePointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
447
+ <fieldType name="binary" class="solr.BinaryField" indexed="false" docValues="false" stored="true" multiValued="false" />
448
+ <fieldType name="random" class="solr.RandomSortField" />
449
+
450
+ <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
451
+ <analyzer>
452
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
453
+ </analyzer>
454
+ </fieldType>
173
455
 
174
- <dynamicField name="*_ws" type="text_ws" indexed="true" stored="true"/>
175
- <dynamicField name="*_txt_en" type="text_en" indexed="true" stored="true"/>
176
- <dynamicField name="*_txt_en_split" type="text_en_splitting" indexed="true" stored="true"/>
177
- <dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true"/>
178
- <dynamicField name="*_txt_rev" type="text_general_rev" indexed="true" stored="true"/>
179
- <dynamicField name="*_phon_en" type="phonetic_en" indexed="true" stored="true"/>
180
- <dynamicField name="*_s_lower" type="lowercase" indexed="true" stored="true"/>
181
- <dynamicField name="*_descendent_path" type="descendent_path" indexed="true" stored="true"/>
182
- <dynamicField name="*_ancestor_path" type="ancestor_path" indexed="true" stored="true"/>
183
- <dynamicField name="*_point" type="point" indexed="true" stored="true"/>
184
- <dynamicField name="*_txt_ga" type="text_ga" indexed="true" stored="true"/>
185
- </fields>
186
-
187
- <uniqueKey>id</uniqueKey>
188
-
189
- <copyField source="title" dest="text"/>
190
- <copyField source="author" dest="text"/>
191
- <copyField source="keywords" dest="text"/>
192
- <copyField source="description" dest="text"/>
193
- <copyField source="wct_title" dest="text"/>
194
- <copyField source="wct_description" dest="text"/>
195
- <copyField source="url" dest="text"/>
196
- <copyField source="content" dest="text"/>
197
- <copyField source="resourcename" dest="resourcename_facet"/>
198
-
199
- <types>
200
- <fieldType name="string" class="solr.StrField" sortMissingLast="true" />
201
- <fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true" docValues="true" />
202
- <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
203
- <fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
204
- <fieldType name="int" class="solr.TrieIntField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
205
- <fieldType name="float" class="solr.TrieFloatField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
206
- <fieldType name="long" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
207
- <fieldType name="double" class="solr.TrieDoubleField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
208
- <fieldType name="ints" class="solr.TrieIntField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
209
- <fieldType name="floats" class="solr.TrieFloatField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
210
- <fieldType name="longs" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
211
- <fieldType name="doubles" class="solr.TrieDoubleField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
212
- <fieldType name="tint" class="solr.TrieIntField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
213
- <fieldType name="tfloat" class="solr.TrieFloatField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
214
- <fieldType name="tlong" class="solr.TrieLongField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
215
- <fieldType name="tdouble" class="solr.TrieDoubleField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
216
- <fieldType name="tints" class="solr.TrieIntField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
217
- <fieldType name="tfloats" class="solr.TrieFloatField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
218
- <fieldType name="tlongs" class="solr.TrieLongField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
219
- <fieldType name="tdoubles" class="solr.TrieDoubleField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
220
- <fieldType name="date" class="solr.TrieDateField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
221
- <fieldType name="dates" class="solr.TrieDateField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
222
- <fieldType name="tdate" class="solr.TrieDateField" docValues="true" precisionStep="6" positionIncrementGap="0"/>
223
- <fieldType name="tdates" class="solr.TrieDateField" docValues="true" precisionStep="6" positionIncrementGap="0" multiValued="true"/>
224
- <fieldType name="binary" class="solr.BinaryField"/>
225
- <fieldType name="random" class="solr.RandomSortField" indexed="true" />
226
-
227
- <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
228
- <analyzer>
229
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
230
- </analyzer>
456
+ <!-- Used for parsing file paths, so that ["MOO BOO/FooBar_zoo.baz"] becomes ["moo", "boo", "foo", "bar", "zoo", "baz"] -->
457
+ <fieldType name="path" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
458
+ <analyzer type="index">
459
+ <tokenizer class="solr.StandardTokenizerFactory" />
460
+ <filter class="solr.WordDelimiterFilterFactory" preserveOriginal="0" />
461
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_path.txt" />
462
+ <filter class="solr.LowerCaseFilterFactory" />
463
+ </analyzer>
464
+ <analyzer type="query">
465
+ <tokenizer class="solr.StandardTokenizerFactory" />
466
+ <filter class="solr.WordDelimiterFilterFactory" preserveOriginal="0" />
467
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_path.txt" />
468
+ <filter class="solr.LowerCaseFilterFactory" />
469
+ </analyzer>
231
470
  </fieldType>
232
471
 
233
- <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" multiValued="true">
234
- <analyzer type="index">
235
- <tokenizer class="solr.StandardTokenizerFactory"/>
236
- <filter class="solr.LowerCaseFilterFactory"/>
237
- </analyzer>
238
- <analyzer type="query">
239
- <tokenizer class="solr.StandardTokenizerFactory"/>
240
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
241
- <filter class="solr.LowerCaseFilterFactory"/>
242
- </analyzer>
472
+ <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
473
+ <analyzer type="index">
474
+ <tokenizer class="solr.StandardTokenizerFactory" />
475
+ <filter class="solr.LowerCaseFilterFactory" />
476
+ </analyzer>
477
+ <analyzer type="query">
478
+ <tokenizer class="solr.StandardTokenizerFactory" />
479
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
480
+ <filter class="solr.LowerCaseFilterFactory" />
481
+ </analyzer>
243
482
  </fieldType>
244
483
 
245
- <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
246
- <analyzer type="index">
247
- <tokenizer class="solr.StandardTokenizerFactory"/>
248
- <filter class="solr.LowerCaseFilterFactory"/>
249
- <filter class="solr.EnglishPossessiveFilterFactory"/>
250
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
251
- </analyzer>
252
- <analyzer type="query">
253
- <tokenizer class="solr.StandardTokenizerFactory"/>
254
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
255
- <filter class="solr.LowerCaseFilterFactory"/>
256
- <filter class="solr.EnglishPossessiveFilterFactory"/>
257
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
258
- </analyzer>
484
+ <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
485
+ <analyzer type="index">
486
+ <tokenizer class="solr.StandardTokenizerFactory" />
487
+ <filter class="solr.LowerCaseFilterFactory" />
488
+ <filter class="solr.EnglishPossessiveFilterFactory" />
489
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
490
+ </analyzer>
491
+ <analyzer type="query">
492
+ <tokenizer class="solr.StandardTokenizerFactory" />
493
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
494
+ <filter class="solr.LowerCaseFilterFactory" />
495
+ <filter class="solr.EnglishPossessiveFilterFactory" />
496
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
497
+ </analyzer>
259
498
  </fieldType>
260
499
 
261
- <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
262
- <analyzer type="index">
263
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
264
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
265
- <filter class="solr.LowerCaseFilterFactory"/>
266
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
267
- </analyzer>
268
- <analyzer type="query">
269
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
270
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
271
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
272
- <filter class="solr.LowerCaseFilterFactory"/>
273
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
274
- </analyzer>
500
+ <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true" indexed="true" stored="true" multiValued="false">
501
+ <analyzer type="index">
502
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
503
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1" />
504
+ <filter class="solr.LowerCaseFilterFactory" />
505
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
506
+ </analyzer>
507
+ <analyzer type="query">
508
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
509
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
510
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1" />
511
+ <filter class="solr.LowerCaseFilterFactory" />
512
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
513
+ </analyzer>
275
514
  </fieldType>
276
515
 
277
- <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
278
- <analyzer>
279
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
280
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
281
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
282
- <filter class="solr.LowerCaseFilterFactory"/>
283
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
284
- <filter class="solr.EnglishMinimalStemFilterFactory"/>
285
- <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
286
- </analyzer>
516
+ <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true" indexed="true" stored="true" multiValued="false">
517
+ <analyzer>
518
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
519
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false" />
520
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0" />
521
+ <filter class="solr.LowerCaseFilterFactory" />
522
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
523
+ <filter class="solr.EnglishMinimalStemFilterFactory" />
524
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory" />
525
+ </analyzer>
287
526
  </fieldType>
288
527
 
289
- <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
290
- <analyzer type="index">
291
- <tokenizer class="solr.StandardTokenizerFactory"/>
292
- <filter class="solr.LowerCaseFilterFactory"/>
293
- <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
294
- maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
295
- </analyzer>
296
- <analyzer type="query">
297
- <tokenizer class="solr.StandardTokenizerFactory"/>
298
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
299
- <filter class="solr.LowerCaseFilterFactory"/>
300
- </analyzer>
528
+ <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
529
+ <analyzer type="index">
530
+ <tokenizer class="solr.StandardTokenizerFactory" />
531
+ <filter class="solr.LowerCaseFilterFactory" />
532
+ <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
533
+ maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33" />
534
+ </analyzer>
535
+ <analyzer type="query">
536
+ <tokenizer class="solr.StandardTokenizerFactory" />
537
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
538
+ <filter class="solr.LowerCaseFilterFactory" />
539
+ </analyzer>
301
540
  </fieldType>
302
541
 
303
- <fieldType name="phonetic_en" stored="false" indexed="true" class="solr.TextField" >
304
- <analyzer>
305
- <tokenizer class="solr.StandardTokenizerFactory"/>
306
- <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
307
- </analyzer>
542
+ <fieldType name="phonetic_en" class="solr.TextField" indexed="true" stored="true" multiValued="false">
543
+ <analyzer>
544
+ <tokenizer class="solr.StandardTokenizerFactory" />
545
+ <filter class="solr.DoubleMetaphoneFilterFactory" inject="false" />
546
+ </analyzer>
308
547
  </fieldType>
309
548
 
310
- <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
311
- <analyzer>
312
- <tokenizer class="solr.KeywordTokenizerFactory"/>
313
- <filter class="solr.LowerCaseFilterFactory" />
314
- </analyzer>
549
+ <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
550
+ <analyzer>
551
+ <tokenizer class="solr.KeywordTokenizerFactory" />
552
+ <filter class="solr.LowerCaseFilterFactory" />
553
+ </analyzer>
315
554
  </fieldType>
316
555
 
317
556
  <fieldType name="descendent_path" class="solr.TextField">
318
- <analyzer type="index">
319
- <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
320
- </analyzer>
321
- <analyzer type="query">
322
- <tokenizer class="solr.KeywordTokenizerFactory" />
323
- </analyzer>
557
+ <analyzer type="index">
558
+ <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
559
+ </analyzer>
560
+ <analyzer type="query">
561
+ <tokenizer class="solr.KeywordTokenizerFactory" />
562
+ </analyzer>
324
563
  </fieldType>
325
564
 
326
565
  <fieldType name="ancestor_path" class="solr.TextField">
327
- <analyzer type="index">
328
- <tokenizer class="solr.KeywordTokenizerFactory" />
329
- </analyzer>
330
- <analyzer type="query">
331
- <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
332
- </analyzer>
566
+ <analyzer type="index">
567
+ <tokenizer class="solr.KeywordTokenizerFactory" />
568
+ </analyzer>
569
+ <analyzer type="query">
570
+ <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
571
+ </analyzer>
333
572
  </fieldType>
334
573
 
335
574
  <fieldType name="ignored" stored="false" indexed="false" docValues="false" multiValued="true" class="solr.StrField" />
336
- <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
337
- <fieldType name="location" class="solr.LatLonPointSpatialField" docValues="true"/>
575
+ <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d" />
576
+ <fieldType name="location" class="solr.LatLonPointSpatialField" indexed="true" stored="false" docValues="true" multiValued="false" />
338
577
 
339
578
  <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100">
340
- <analyzer>
341
- <tokenizer class="solr.StandardTokenizerFactory"/>
342
- <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/>
343
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt"/>
344
- <filter class="solr.IrishLowerCaseFilterFactory"/>
345
- </analyzer>
579
+ <analyzer>
580
+ <tokenizer class="solr.StandardTokenizerFactory" />
581
+ <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt" />
582
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" />
583
+ <filter class="solr.IrishLowerCaseFilterFactory" />
584
+ </analyzer>
346
585
  </fieldType>
347
586
 
348
587
  <!-- BL UKWA: additional -->
349
- <fieldType name="literal_ngram" stored="false" indexed="true" class="solr.TextField">
350
- <analyzer>
351
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
352
- <filter class="solr.NGramFilterFactory" minGramSize="2" maxGramSize="5"/>
353
- </analyzer>
588
+ <fieldType name="literal_ngram" class="solr.TextField" indexed="true" stored="false" multiValued="false" >
589
+ <analyzer>
590
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
591
+ <filter class="solr.NGramFilterFactory" minGramSize="2" maxGramSize="5" />
592
+ </analyzer>
354
593
  </fieldType>
355
594
 
356
- <fieldType name="hex_text_shingle" class="solr.TextField" positionIncrementGap="100">
357
- <analyzer>
358
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
359
- <filter class="solr.ShingleFilterFactory" minShingleSize="4" maxShingleSize="8" outputUnigrams="false" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
360
- </analyzer>
595
+ <fieldType name="hex_text_shingle" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" docValues="false">
596
+ <analyzer>
597
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
598
+ <filter class="solr.ShingleFilterFactory" minShingleSize="4" maxShingleSize="8" outputUnigrams="false" outputUnigramsIfNoShingles="false" tokenSeparator=" " />
599
+ </analyzer>
361
600
  </fieldType>
362
601
  <!--:BL UKWA -->
363
- </types>
602
+ </types>
364
603
  </schema>