warclight 0.6.3 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/solr/conf/schema.xml CHANGED
@@ -16,349 +16,588 @@
16
16
  limitations under the License.
17
17
  -->
18
18
 
19
+ <!--
20
+ This schema is for Solr 7+ and will not work under Solr 6.
21
+ -->
22
+
19
23
  <schema name="ukwa" version="1.6">
20
- <fields>
21
- <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" docValues="true"/>
22
- <field name="_version_" type="long" indexed="true" docValues="true"/>
23
- <field name="_root_" type="string" indexed="true" docValues="true" />
24
- <field name="_text_" type="text_general" indexed="true" stored="false" multiValued="true"/>
24
+ <fields>
25
+ <!-- Solr special purpose meta-fields. Explicit attributes to be sure they are set correctly -->
26
+ <field name="id" type="string" indexed="true" stored="true" docValues="true" required="true" />
27
+ <field name="_version_" type="long" indexed="true" stored="true" docValues="true" />
28
+ <field name="_root_" type="string" indexed="true" stored="true" docValues="true"/>
29
+ <field name="_text_" type="text_general" multiValued="true" /> <!-- Isn't this only used for schema-less? -->
25
30
 
26
- <!--Not defined in SolrFields. Schema only defintion -->
27
- <field name="index_time" type="date" indexed="true" stored="false" docValues="true" default="NOW" />
31
+ <!-- The time of document indexing. Set automatically by Solr.
32
+ Sample use: Freezing a query result even when new documents are added to the index:
33
+ q=foo&fq=index_time:[* TO 2018-05-16T10:33:00Z]
34
+ Sample use: Discover new documents added since last check for new documents:
35
+ q=*:*&fq=index_time:[2018-05-16T10:33:00Z TO *] -->
36
+ <field name="index_time" type="date" default="NOW" />
28
37
 
29
- <!-- BL UKWA: additional -->
30
- <field name="access_terms" type="string" indexed="true" docValues="true" multiValued="true"/>
31
- <field name="author" type="string" indexed="true" docValues="true" multiValued="true"/>
32
- <field name="category" type="text_general" indexed="true" stored="true"/>
33
- <field name="collection" type="string" indexed="true" docValues="true" multiValued="true"/>
34
- <field name="collections" type="string" indexed="true" docValues="true" multiValued="true"/>
35
- <field name="comments" type="text_general" indexed="true" stored="true"/>
36
- <field name="content_encoding" type="string" indexed="true" docValues="true" multiValued="false"/>
37
- <field name="content_ffb" type="string" indexed="true" docValues="true" multiValued="false"/>
38
- <field name="content_first_bytes" type="hex_text_shingle" indexed="true" stored="true" multiValued="false"/>
39
- <field name="content_language" type="string" indexed="true" docValues="true" multiValued="false"/>
40
- <field name="content_length" type="tint" indexed="true" stored="false" multiValued="false" docValues="true"/>
41
- <field name="content_metadata_ss" type="string" indexed="true" docValues="true" multiValued="true"/>
42
- <field name="content_metadata" type="text_general" indexed="true" stored="true" multiValued="false"/>
43
- <field name="content_text_length" type="tint" indexed="true" stored="false" multiValued="false" docValues="true"/>
44
- <field name="content_type_droid" type="string" indexed="true" docValues="true" multiValued="false"/>
45
- <field name="content_type_ext" type="string" indexed="true" docValues="true" multiValued="false"/>
46
- <field name="content_type_full" type="string" indexed="true" docValues="true" multiValued="false"/>
47
- <field name="content_type_norm" type="string" indexed="true" docValues="true" multiValued="false" default="other"/>
48
- <field name="content_type_served" type="string" indexed="true" docValues="true" multiValued="false"/>
49
- <field name="content" type="text_general" indexed="true" stored="true" multiValued="true"/>
50
- <field name="content_type_tika" type="string" indexed="true" docValues="true" multiValued="false"/>
51
- <field name="content_type" type="string" indexed="true" docValues="true" multiValued="true"/>
52
- <field name="content_type_version" type="string" indexed="true" docValues="true" multiValued="false"/>
53
- <field name="crawl_dates" type="tdate" indexed="true" stored="true" multiValued="true"/>
54
- <field name="crawl_date" type="tdate" indexed="true" stored="false" multiValued="false" docValues="true"/>
55
- <field name="crawl_year_month_day" type="int" indexed="true" docValues="true" multiValued="false"/>
56
- <field name="crawl_year_month" type="int" indexed="true" docValues="true" multiValued="false"/>
57
- <field name="crawl_years" type="int" indexed="true" docValues="true" multiValued="true"/>
58
- <field name="crawl_year" type="int" indexed="true" docValues="true" multiValued="false"/>
59
- <field name="description" type="text_general" indexed="true" stored="true"/>
60
- <field name="domain" type="string" indexed="true" docValues="true" multiValued="false"/>
61
- <field name="elements_used" type="string" indexed="true" docValues="true" multiValued="true"/>
62
- <field name="generator" type="string" indexed="true" docValues="true" multiValued="true"/>
63
- <field name="hash" type="string" indexed="true" docValues="true" multiValued="false"/>
64
- <field name="hashes" type="string" indexed="true" docValues="true" multiValued="true"/>
65
- <field name="host" type="string" indexed="true" docValues="true" multiValued="false"/>
66
- <field name="host_surt" type="string" indexed="true" docValues="true" multiValued="true"/>
67
- <field name="id_long" type="long" indexed="true" stored="true" multiValued="false"/>
68
- <field name="image_colours" type="string" indexed="true" stored="true" multiValued="true"/>
69
- <field name="image_dominant_colour" type="string" indexed="true" stored="true" multiValued="false"/>
70
- <field name="image_faces_count" type="tint" indexed="true" stored="true" multiValued="false"/>
71
- <field name="image_faces" type="string" indexed="false" stored="true" multiValued="true"/>
72
- <field name="image_height" type="tlong" indexed="true" stored="true" multiValued="false"/>
73
- <field name="image_size" type="tlong" indexed="true" stored="true" multiValued="false"/>
74
- <field name="image_width" type="tlong" indexed="true" stored="true" multiValued="false"/>
75
- <field name="keywords" type="text_general" indexed="true" stored="true"/>
76
- <field name="last_modified" type="tdate" indexed="true" stored="true" docValues="true"/>
77
- <field name="last_modified_year" type="string" indexed="true" docValues="true"/>
78
- <field name="license_url" type="string" indexed="true" docValues="true" multiValued="true"/>
79
- <field name="links_images" type="text_general" indexed="true" stored="true" multiValued="true"/>
80
- <field name="links_domains" type="string" indexed="true" docValues="true" multiValued="true"/>
81
- <field name="links_hosts" type="string" indexed="true" docValues="true" multiValued="true"/>
82
- <field name="links_hosts_surts" type="string" indexed="true" docValues="true" multiValued="true"/>
83
- <field name="links_norm" type="string" indexed="true" docValues="true" multiValued="true"/>
84
- <field name="links_public_suffixes" type="string" indexed="true" docValues="true" multiValued="true"/>
85
- <field name="links" type="string" indexed="true" docValues="true" multiValued="true"/>
86
- <field name="locations" type="location" indexed="true" stored="true" multiValued="true"/>
87
- <field name="parse_error" type="string" indexed="true" docValues="true" multiValued="true"/>
88
- <field name="pdf_pdfa_errors" type="text_general" indexed="false" stored="true" multiValued="true"/>
89
- <field name="pdf_pdfa_is_valid" type="string" indexed="true" docValues="true" multiValued="false"/>
90
- <field name="postcode_district" type="string" indexed="true" docValues="true" multiValued="true"/>
91
- <field name="postcode" type="string" indexed="true" docValues="true" multiValued="true"/>
92
- <field name="publication_date" type="tdate" indexed="true" stored="true" multiValued="false"/>
93
- <field name="publication_year" type="string" indexed="true" docValues="true" multiValued="false"/>
94
- <field name="public_suffix" type="string" indexed="true" docValues="true" multiValued="false"/>
95
- <field name="record_type" type="string" indexed="true" stored="true" multiValued="false" docValues="true"/>
96
- <field name="redirect_to_norm" type="string" indexed="true" stored="false" docValues="true" multiValued="false"/>
97
- <field name="referrer_url" type="string" indexed="true" stored="true" multiValued="false" docValues="true"/>
98
- <field name="resourcename" type="text_general" indexed="true" stored="true"/>
99
- <field name="resourcename_facet" type="string" indexed="true" multiValued="false" docValues="true"/>
100
- <field name="sentiment_score" type="float" indexed="true" stored="true" multiValued="false"/>
101
- <field name="sentiment" type="string" indexed="true" docValues="true" multiValued="false"/>
102
- <field name="server" type="string" indexed="true" docValues="true" multiValued="true"/>
103
- <field name="source_file_path" type="string" indexed="true" docValues="true" />
104
- <field name="source_file_offset" type="tlong" indexed="true" stored="true" />
105
- <field name="source_file" type="string" indexed="true" docValues="true" />
106
- <field name="status_code" type="int" indexed="true" stored="true" docValues="true" />
107
- <field name="subject" type="text_general" indexed="true" stored="true" multiValued="true"/>
108
- <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
109
- <field name="title" type="text_general" indexed="true" stored="true" multiValued="false"/>
110
- <field name="type" type="string" indexed="true" docValues="true" multiValued="false"/>
111
- <field name="url_norm" type="string" indexed="true" stored="false" docValues="true" multiValued="false"/>
112
- <field name="url_path" type="string" indexed="true" stored="false" docValues="true" multiValued="false"/>
113
- <field name="url" type="string" indexed="true" stored="false" docValues="true" multiValued="false"/>
114
- <field name="url_type" type="text_general" indexed="true" stored="true"/>
115
- <field name="wayback_date" type="long" indexed="false" stored="true" docValues="false" multiValued="false"/>
116
- <field name="wct_agency" type="string" indexed="true" docValues="true" multiValued="false"/>
117
- <field name="wct_collections" type="string" indexed="true" docValues="true" multiValued="true"/>
118
- <field name="wct_description" type="text_general" indexed="true" stored="true"/>
119
- <field name="wct_instance_id" type="int" indexed="true" stored="true" multiValued="false"/>
120
- <field name="wct_subjects" type="string" indexed="true" docValues="true" multiValued="true"/>
121
- <field name="wct_target_id" type="string" indexed="true" docValues="true" multiValued="false"/>
122
- <field name="wct_title" type="string" indexed="true" docValues="true"/>
123
- <field name="xml_root_ns" type="string" indexed="true" docValues="true" multiValued="false"/>
124
- <field name="warc_key_id" type="string" indexed="true" docValues="true" multiValued="false"/>
125
- <field name="warc_ip" type="string" indexed="true" docValues="true" multiValued="false"/>
126
- <!--:BL UKWA -->
38
+ <!-- BL UKWA: Access flag (i.e. Open Access or not) -->
39
+ <field name="access_terms" type="string" multiValued="true" />
40
+
41
+ <!-- Author extracted from HTML meta-fields, Word documents meta data, image Exif etc.
42
+ Search directly in the author-field is verbatim and thus not very usable for user-defined queries.
43
+ Sample use: Faceting with facet=true&facet.field=author -->
44
+ <field name="author" type="string" />
45
+
46
+ <!-- Does not seem to be used as of 20180516 -->
47
+ <field name="category" type="text_general" />
48
+
49
+ <!-- Institution-specific collection names. Can be specified when calling the indexer -->
50
+ <field name="collection" type="string" multiValued="true" /> <!-- Why is this multi-valued? -->
51
+ <field name="collections" type="string" multiValued="true" />
52
+
53
+ <!-- Does not seem to be used as of 20180516 -->
54
+ <field name="comments" type="text_general" multiValued="true" />
55
+
56
+ <!-- Dublin Core description tag from HTML pages -->
57
+ <field name="description" type="text_general" />
58
+
59
+ <!-- hashtags and other keywords -->
60
+ <field name="keywords" type="text_general" multiValued="true" />
61
+
62
+ <!-- Licence URL as specified on HTML pages using links with rel=license -->
63
+ <field name="license_url" type="string" multiValued="true" />
64
+
65
+ <!-- The core content of the resource (all text with tags stripped from HTML pages, the text in a Word document)
66
+ Note: This field is not searchable. Use 'text' for search.
67
+ Sample use: Highlighting with q=floodgate&hl=true&hl.field=content -->
68
+ <field name="content" type="text_general" indexed="false" />
69
+
70
+ <!-- The original encoding of the content (UTF-8/ISO-8859-1/Windows-1250...)
71
+ Note: Regardless of the original encoding, content is always converted to UTF-8 in the Solr document -->
72
+ <field name="content_encoding" type="string" />
73
+
74
+ <!-- The first 4 bytes of the content, represented as lower-case hex with no space -->
75
+ <field name="content_ffb" type="string" />
76
+
77
+ <!-- The first 32 bytes of the content, represented as shingled space-separated lower-case hex.
78
+ Sample use: Locate sub-sequences of bytes within the first 32 bytes (signature search):
79
+ content_first_bytes:"89 50 4e 47" locates content which is probably PNG -->
80
+ <field name="content_first_bytes" type="hex_text_shingle" />
81
+
82
+ <!-- Language as detected by Tika.
83
+ Sample use: Faceting on language with facet=true&facet.field=content_language -->
84
+ <field name="content_language" type="string" />
85
+
86
+ <!-- The content length measured in bytes.
87
+ Sample use: Sort by content size with sort=content_length desc
88
+ Sample use: Size statistics for the full result set: stats=true&stats.field=content_length -->
89
+ <field name="content_length" type="int" />
90
+ <!-- <field name="content_metadata_ss" type="string" multiValued="true" />--> <!-- Not used for anything -->
91
+
92
+ <!-- If warc.index.tika.extract_all_metadata was enabled during indexing, Tika metadata is added here. -->
93
+ <field name="content_metadata" type="text_general" />
94
+
95
+ <!-- The content length measured in characters. Mostly relevant for text-based formats (html, doc, pdf...).
96
+ Sample use: Sort by text length with sort=content_text_length desc
97
+ Sample use: Size statistics for the full result set: stats=true&stats.field=content_text_length -->
98
+ <field name="content_text_length" type="int" />
99
+
100
+ <!-- The MIME content type as determined by DROID -->
101
+ <field name="content_type_droid" type="string" />
102
+ <!-- The file extension: my.sample.png will yield 'png' -->
103
+ <field name="content_type_ext" type="string" />
104
+ <!-- Best-guess MIME-type for the content, based on droid, Tika, WARC-header, HTTP-header and
105
+ webarchive-discovery processing -->
106
+ <field name="content_type_full" type="string" />
107
+ <!-- Content type represented as low-cardinality human-readable text: image, video, text etc. -->
108
+ <field name="content_type_norm" type="string" default="other" />
109
+ <!-- The MIME content type as specified by the web server the resource was harvested from -->
110
+ <field name="content_type_served" type="string" />
111
+ <!-- The MIME content type as determined by Tika -->
112
+ <field name="content_type_tika" type="string" />
113
+ <!-- Not clear what this is. TODO: Determine what it is -->
114
+ <field name="content_type" type="string" /> <!-- Used to be multi-valued -->
115
+ <!-- The version for the MIME type, if available -->
116
+ <field name="content_type_version" type="string" />
117
+
118
+ <!-- The HTML elements used if the resource is a HTML page -->
119
+ <field name="elements_used" type="string" multiValued="true" />
120
+ <!-- Hash of the content (SHA1) -->
121
+ <field name="hash" type="string" />
122
+
123
+ <!-- Does not seem to be used as of 20180516 -->
124
+ <field name="hashes" type="string" multiValued="true" />
125
+ <!-- Does not seem to be used as of 20180516 -->
126
+ <field name="id_long" type="long" />
127
+
128
+ <!-- The date represented as a long in the form of YYYYmmddHHMMSS, which is compatible with Wayback.
129
+ The field is not searchable. Use crawl_date for search and general processing -->
130
+ <field name="wayback_date" type="long" indexed="false" stored="true" docValues="false" />
131
+ <!-- If webarchive-discovery runs in update-mode, multiple harvests of the same URL will be collapsed to
132
+ a single document and the dates from the different harvests will be added to this field -->
133
+ <field name="crawl_dates" type="date" stored="true" docValues="false" multiValued="true" />
134
+ <!-- The crawl-date as specified in the WARC.
135
+ Sample use: Faceting by date with
136
+ facet=true&facet.range=crawl_date&facet.range.start=2010-01-01T00:00:00Z&facet.range.end=2019-01-01T00:00:00Z&facet.range.gap=+1MONTH&facet.range.method=dv
137
+ Sample use: Sorting newest material first: sort=crawl_date desc
138
+ -->
139
+ <field name="crawl_date" type="date" />
140
+ <!-- month_day & day not used for anything -->
141
+ <!-- <field name="crawl_year_month_day" type="int" />
142
+ <field name="crawl_year_month" type="int" />-->
143
+ <!-- If webarchive-discovery runs in update-mode, multiple harvests of the same URL will be collapsed to
144
+ a single document and the years from the dates from the different harvests will be added to this field -->
145
+ <field name="crawl_years" type="int" multiValued="true" />
146
+ <!-- The year extracted from crawl_date. Faster than crawl_date if used for faceting.
147
+ Sample use: Faceting by year with facet.field=crawl_year&facet.sort=index&facet=true -->
148
+ <field name="crawl_year" type="int" />
149
+ <!-- Last modified timestamp extracted from the resource. Sources such as JPEG images, PDF files and Word
150
+ documents often have this.
151
+ Note: This is not a very reliable timestamp for most formats. JPEGs tend to work quite well.
152
+ Sample use: Sorting by age as stated in the format sort=last_modified asc -->
153
+ <field name="last_modified" type="date" />
154
+ <!-- The year from last_modified -->
155
+ <field name="last_modified_year" type="string" /> <!-- Why is this a string? -->
156
+
157
+ <!-- Heavily normalised URL: http/https is collapsed to http, everything is lowercased, trailing / are removed
158
+ for all URLs, except those pointing to root, e.g. "http://example.com/". There is more processing than
159
+ that. If the field is to be queried with a user-provided URL, it is highly recommended to use the method
160
+ Normalisation.canonicaliseURL() from webarchive-discovery to ensure match.
161
+ This field matches normalisation with the links-field, making it possible to perform graph traversals.
162
+ Note: This field has very high cardinality (a little less than the number of documents in the index).
163
+ Faceting should be done with care and is likely to fail with an OutOfMemoryException on a large index
164
+ using stock Solr. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
165
+ <field name="url_norm" type="string" />
166
+ <!-- Variation of url_norm intended for search for partial URLs.
167
+ Sample use: Search for large images with q=url_search:"images/large" -->
168
+ <field name="url_search" type="path" stored="false" /> <!-- search only to save space-->
169
+ <!-- Path-only for the URL: http://example.com/foo/bar.png becomes /foo/bar.png -->
170
+ <field name="url_path" type="string" />
171
+ <!-- Original URL, as specified in the WARC header. Not analysed and thus likely to give false negatives
172
+ if searched directly with user-input. Consider using url_norm for searching.
173
+ Note: This field has very high cardinality (a little less than the number of documents in the index).
174
+ Faceting should be done with care and is likely to fail with an OutOfMemoryException on a large index
175
+ using stock Solr. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
176
+ <field name="url" type="string" />
177
+ <!-- Possible values: normal, robots.txt and slashpage (root page for the domain).
178
+ Sample use: Search only for root pages with q=url_type:slashpage -->
179
+ <field name="url_type" type="string" />
180
+ <!-- The domain from the URL. The domain is the short name, registered by the domain owner.
181
+ This can be coupled with the links_domains field for building graphs.
182
+ Sample use: Faceting to show most popular domains with facet=true&facet.field=domain -->
183
+ <field name="domain" type="string" />
184
+ <!-- The host from the URL. The ending of the host is always the same as the domain, with optional prefix,
185
+ e.g. a host can be foo.bar.zoo.example.com or just example.com for the domain example.com.
186
+ Sample use: Faceting to show most popular hosts with facet=true&facet.field=host -->
187
+ <field name="host" type="string" />
188
+ <!-- The host from the URL in SURT'ed form: http://webarchivingbucket.com/techblog/?p=48
189
+ e.g. foo.bar.dk becomes the three values ["(dk,", "(dk,bar,", "(dk,bar,foo"]. -->
190
+ <field name="host_surt" type="string" multiValued="true" />
191
+ <!-- The part below the domain in the URL. For all dk-domains it will be dk. For domains such
192
+ as myname.blogspot.com and mycompany.co.uk it will be blogspot.com and co.uk. -->
193
+ <field name="public_suffix" type="string" />
194
+ <!-- The last part of the URL, typically a filename, e.g. giant_rabbitFoot.png. The field is analysed with an
195
+ aggressive tokenizer, so that giant_rabbitFoot.png is split into [giant, rabbit, foot, png] and searches
196
+ are not dependent on knowing file extensions etc.
197
+ See resourcename_facet for sorting, grouping and faceting.
198
+ Sample use: Search for images of kittens: q=resourcename:kittens&fq=content_type_norm:image -->
199
+ <field name="resourcename" type="path" />
200
+ <!-- Mirror of resourcename intended for sorting, grouping and faceting.
201
+ Important note: This is a high-cardinality field. Faceting on a web archive with billions of records
202
+ will likely lead to memory problems. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
203
+ <field name="resourcename_facet" type="string" />
204
+
205
+ <!-- Does not seem to be used as of 20180516 -->
206
+ <field name="image_colours" type="string" multiValued="true" />
207
+ <!-- If warc.index.extract.content.images.dominantColours was true during indexing, this field will contain
208
+ the dominant colour if the resource is an image. The colour is a human readable name, e.g. crimson,
209
+ ivory or goldenrod, as defined by https://www.w3.org/TR/SVG/types.html#ColorKeywords -->
210
+ <field name="image_dominant_colour" type="string" />
211
+ <!-- If warc.index.extract.content.images.detectFaces was true during indexing, this will contain the number
212
+ of faces detected if the resource is an image.
213
+ Note: Face recognition is heavy and it is recommended not to enable it unless the need is high -->
214
+ <field name="image_faces_count" type="int" />
215
+ <!-- If warc.index.extract.content.images.detectFaces was true during indexing, this will contain the faces
216
+ detected if the resource is an image. A face is represented by a bounding box relative to the original
217
+ image.
218
+ Note: Face recognition is heavy and it is recommended not to enable it unless the need is high -->
219
+ <field name="image_faces" type="string" indexed="false" stored="true" docValues="false" multiValued="true" />
220
+ <!-- Image height in pixels.
221
+ Sample use: Get statistics for image height with stats=true&stats.field=image_height -->
222
+ <field name="image_height" type="long" />
223
+ <!-- Image size in pixels (width*height).
224
+ Sample use: Get statistics for image size with stats=true&stats.field=image_size
225
+ Sample use: Locate largest images with sort=image_size desc -->
226
+ <field name="image_size" type="long" />
227
+ <!-- Image width in pixels.
228
+ Sample use: Get statistics for image width with stats=true&stats.field=image_width -->
229
+ <field name="image_width" type="long" />
230
+
231
+ <!-- Links to images shown on a given web page (aka embedded images).
232
+ Normalised the same way as url_norm -->
233
+ <field name="links_images" type="string" multiValued="true" />
234
+ <!-- domains from outgoing links for a HTML page -->
235
+ <field name="links_domains" type="string" multiValued="true" />
236
+ <!-- hosts from outgoing links for a HTML page -->
237
+ <field name="links_hosts" type="string" multiValued="true" />
238
+ <!-- SURT'ed form of hosts (see the host_surt field) from outgoing links for a HTML page -->
239
+ <field name="links_hosts_surts" type="string" multiValued="true" />
240
+ <!-- Might be used in the future but will take up a lot of space (same as 'links') -->
241
+ <!-- <field name="links_norm" type="string" multiValued="true" />-->
242
+ <!-- public suffixes (see public_suffix field) from outgoing links for a HTML page -->
243
+ <field name="links_public_suffixes" type="string" multiValued="true" />
244
+ <!-- Links to external resources (i.e. not images or other embedded content).
245
+ Normalised the same way as url_norm
246
+ Note: This field has extremely high cardinality (10 times the number of documents in the index).
247
+ Faceting should be done with care and is highly likely to fail with an OutOfMemoryException even on a
248
+ medium sized index using stock Solr. This problem is being worked on by Toke Eskildsen - toes@kb.dk -->
249
+ <field name="links" type="string" multiValued="true" />
250
+
251
+ <!-- Geographical coordinates, extracted from postcodes.
252
+ Sample use: Find images taken within a given radius from a given geo location with Solr geodist search
253
+ q=({!geofilt sfield=locations}) AND *:*&pt=56.17,10.20&d=0.8
254
+ where the coordinates can be retrieved from e.g. Google Maps, where they are shown at the bottom of
255
+ the map when a location is clicked. d is distance in kilometers.
256
+ See https://lucene.apache.org/solr/guide/7_3/spatial-search.html for details -->
257
+ <field name="locations" type="location" multiValued="true" />
258
+
259
+ <!-- Non-fatal errors during meta data extraction as part of indexing -->
260
+ <field name="parse_error" type="string" multiValued="true" />
261
+ <!-- If warc.index.extract.content.extractApachePreflightErrors is true during indexing, this field will
262
+ contain errors encountered during PDF/A-validation -->
263
+ <field name="pdf_pdfa_errors" type="string" multiValued="true" />
264
+ <!-- If warc.index.extract.content.extractApachePreflightErrors is true during indexing, this field will
265
+ be true if the resource was a PDF and a valid PDF/A.
266
+ Note: PDF validation is heavy and it is recommended not to enable it unless the need is high-->
267
+ <field name="pdf_pdfa_is_valid" type="string" />
268
+ <!-- UK postcodes only, as they are easily recognizable -->
269
+ <field name="postcode_district" type="string" multiValued="true" />
270
+ <!-- UK postcodes only, as they are easily recognizable -->
271
+ <field name="postcode" type="string" multiValued="true" />
272
+
273
+ <!-- Does not seem to be used as of 20180516 -->
274
+ <field name="publication_date" type="date" />
275
+ <!-- Does not seem to be used as of 20180516 -->
276
+ <field name="publication_year" type="string" />
277
+ <!-- The source format. Currently arc or warc. This might be extended in the future -->
278
+ <field name="record_type" type="string" />
279
+ <!-- If warc.index.extract.content.text_sentimentj was true during indexing, this field will contain a
280
+ numeric score for the sentiment, with 0.0 being "very negative" and high values being "very positive" -->
281
+ <field name="sentiment_score" type="float" />
282
+ <!-- If warc.index.extract.content.text_sentimentj was true during indexing, this field will contain a
283
+ human readable assessment of the sentiment, from "very negative" to "very positive" -->
284
+ <field name="sentiment" type="string" />
285
+
286
+ <!-- The HTTP server as stated in the HTTP-headers -->
287
+ <field name="server" type="string" multiValued="true" />
288
+ <!-- Status-code for the resource, as stated in the HTTP-headers from the originating web server -->
289
+ <field name="status_code" type="int" />
290
+ <!-- The generator for the resource, e.g. Wordpress or Photoshop -->
291
+ <field name="generator" type="string" multiValued="true" />
292
+ <!-- Does not seem to be used as of 20180516 -->
293
+ <field name="referrer_url" type="string" />
294
+ <!-- If the resource is returned with a 3xx HTTP response code, it is a redirection. This field contains
295
+ the URL that the resource redirects to, normalised like url_norm -->
296
+ <field name="redirect_to_norm" type="string" />
297
+
298
+ <!-- The full path of the origin container (typically WARC) for the harvested resource, e.g.
299
+ /harvests/full/2018-05/myharvest_20180516_1706.warc.gz
300
+ Sample use: Delivery of the raw resource by seeking to the source_file_offset in source_file and
301
+ reading the resource bytes (the size of the resource is stated in the headers in the source_file) -->
302
+ <field name="source_file_path" type="string" />
303
+ <!-- The offset for the resource within the source_file (aka WARC).
304
+ Sample use: Delivery of the raw resource by seeking to the source_file_offset in source_file and
305
+ reading the resource bytes (the size of the resource is stated in the headers in the source_file) -->
306
+ <field name="source_file_offset" type="long" /> <!-- docValues as it will probably be used for streaming export -->
307
+ <!-- The file name of the origin container (typically WARC) for the harvested resource, e.g.
308
+ myharvest_20180516_1706.warc.gz
309
+ Sample use: Delivery of the raw resource by seeking to the source_file_offset in source_file and
310
+ reading the resource bytes (the size of the resource is stated in the headers in the source_file) -->
311
+ <field name="source_file" type="string" />
312
+
313
+ <!-- Catch-all search field. All text content is copied here. -->
314
+ <field name="text" type="text_general" stored="false" multiValued="true" /> <!-- Catch-all -->
315
+ <!-- HTML page <title>, Word document title, Dublin Core title, etc -->
316
+ <field name="title" type="text_general" />
317
+ <!-- Variant of content_type_norm with human readable designations for the content type -->
318
+ <field name="type" type="string" />
319
+
320
+ <!-- Meta data from Web Curator Tool -->
321
+ <field name="wct_agency" type="string" />
322
+ <field name="wct_collections" type="string" multiValued="true" />
323
+ <field name="wct_description" type="text_general" />
324
+ <field name="wct_instance_id" type="int" indexed="true" stored="true" docValues="false" />
325
+ <field name="wct_subjects" type="string" multiValued="true" />
326
+ <field name="wct_target_id" type="string" />
327
+ <field name="wct_title" type="string" />
328
+
329
+ <!-- Root namespace for XML files.
330
+ Sample use: Facet to get most popular XML formats with facet=true&facet.field=xml_root_ns -->
331
+ <field name="xml_root_ns" type="string" />
332
+ <!-- WARC-Record-ID if available -->
333
+ <field name="warc_key_id" type="string" />
334
+ <!-- WARC-IP-Address if available -->
335
+ <field name="warc_ip" type="string" />
336
+
337
+ <!-- Geographical coordinates, extracted from image Exif data.
338
+ Sample use: Find images taken within a given radius from a given geo location with Solr geodist search
339
+ q=({!geofilt sfield=exif_location}) AND *:*&pt=56.17,10.20&d=0.8
340
+ where the coordinates can be retrieved from e.g. Google Maps, where they are shown at the bottom of
341
+ the map when a location is clicked. d is distance in kilometers.
342
+ See https://lucene.apache.org/solr/guide/7_3/spatial-search.html for details -->
343
+ <field name="exif_location" type="location" />
344
+ <!-- The Exif version (Exchangeable image file format) -->
345
+ <field name="exif_version" type="string" stored="true" docValues="false" />
346
+
347
+ <!-- Fuzzy matching on text for similarity search.
348
+ If warc.index.extract.content.text_fuzzy_hash is true during indexing, fields for SSDeep hashes will
349
+ be created. See https://ssdeep-project.github.io/ssdeep/ for details -->
350
+ <dynamicField name="ssdeep_hash_bs_*" type="string" stored="true" docValues="false" />
351
+ <!-- Does not seem to be used as of 20180517 -->
352
+ <dynamicField name="ssdeep_hash_ngram_bs_*" type="literal_ngram" stored="true" />
127
353
 
128
354
  <!-- User supplied Archive-It fields: -->
129
- <field name="institution" type="string" indexed="true" multiValued="false" docValues="true"/>
130
- <field name="collection_id" type="string" indexed="true" multiValued="false" docValues="true"/>
355
+ <field name="institution" type="string" />
356
+ <field name="collection_id" type="string" />
131
357
  <!--:User supplied Archive-It fields -->
132
358
 
133
- <dynamicField name="*_i" type="int" indexed="true" stored="true"/>
134
- <dynamicField name="*_is" type="ints" indexed="true" stored="true"/>
135
- <dynamicField name="*_s" type="string" indexed="true" stored="true" />
136
- <dynamicField name="*_ss" type="strings" indexed="true" stored="true"/>
137
- <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
138
- <dynamicField name="*_ls" type="longs" indexed="true" stored="true"/>
139
- <dynamicField name="*_t" type="text_general" indexed="true" stored="true"/>
140
- <dynamicField name="*_txt" type="text_general" indexed="true" stored="true"/>
141
- <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
142
- <dynamicField name="*_bs" type="booleans" indexed="true" stored="true"/>
143
- <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
144
- <dynamicField name="*_fs" type="floats" indexed="true" stored="true"/>
145
- <dynamicField name="*_d" type="double" indexed="true" stored="true"/>
146
- <dynamicField name="*_ds" type="doubles" indexed="true" stored="true"/>
147
- <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false" />
148
- <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
149
- <dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true"/>
150
- <dynamicField name="*_p" type="location" indexed="true" stored="true"/>
151
- <dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
152
- <dynamicField name="*_tis" type="tints" indexed="true" stored="true"/>
153
- <dynamicField name="*_tl" type="tlong" indexed="true" stored="true"/>
154
- <dynamicField name="*_tls" type="tlongs" indexed="true" stored="true"/>
155
- <dynamicField name="*_tf" type="tfloat" indexed="true" stored="true"/>
156
- <dynamicField name="*_tfs" type="tfloats" indexed="true" stored="true"/>
157
- <dynamicField name="*_td" type="tdouble" indexed="true" stored="true"/>
158
- <dynamicField name="*_tds" type="tdoubles" indexed="true" stored="true"/>
159
- <dynamicField name="*_tdt" type="tdate" indexed="true" stored="true"/>
160
- <dynamicField name="*_tdts" type="tdates" indexed="true" stored="true"/>
161
- <dynamicField name="ignored_*" type="ignored" multiValued="true"/>
162
- <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>
163
- <dynamicField name="random_*" type="random" />
164
-
165
- <!--:IMAGE EXIF-->
166
- <field name="exif_location" type="location" indexed="true" stored="true" multiValued="false"/>
167
- <field name="exif_version" type="string" indexed="true" stored="true" multiValued="false"/>
359
+ <!-- Harvest meta-data derived from WARC file names using regexp-rules defined in the warc-indexer config file.
360
+ Primarily used by the Royal Danish Library -->
361
+ <field name="arc_full" type="string" stored="true" docValues="false" />
362
+ <field name="arc_name" type="string" />
363
+ <field name="arc_orig" type="string" />
364
+ <field name="arc_job" type="string" />
365
+ <field name="arc_harvest" type="string" />
366
+ <field name="arc_harvesttime" type="string" />
168
367
 
169
- <!-- BL UKWA: additional -->
170
- <dynamicField name="ssdeep_hash_bs_*" type="string" indexed="true" stored="true" multiValued="false"/>
171
- <dynamicField name="ssdeep_hash_ngram_bs_*" type="literal_ngram" indexed="true" stored="true" multiValued="false"/>
172
- <!--:BL UKWA -->
368
+ <!-- Dynamic fields intended for institution-specific fields without changing the schema.
369
+ (yes, the arc_*-fields above should have been dynamic fields instead of hardcoded)
370
+ TODO: Add DocValues-enabled variants (take care not to change existing definitions) -->
371
+ <dynamicField name="*_i" type="int" indexed="true" stored="true" />
372
+ <dynamicField name="*_is" type="int" indexed="true" stored="true" multiValued="true" />
373
+ <dynamicField name="*_s" type="string" indexed="true" stored="true" />
374
+ <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true" />
375
+ <dynamicField name="*_l" type="long" indexed="true" stored="true" />
376
+ <dynamicField name="*_ls" type="long" indexed="true" stored="true" multiValued="true" />
377
+ <dynamicField name="*_t" type="text_general" indexed="true" stored="true" />
378
+ <dynamicField name="*_txt" type="text_general" indexed="true" stored="true" />
379
+ <dynamicField name="*_b" type="boolean" indexed="true" stored="true" />
380
+ <dynamicField name="*_bs" type="boolean" indexed="true" stored="true" multiValued="true" />
381
+ <dynamicField name="*_f" type="float" indexed="true" stored="true" />
382
+ <dynamicField name="*_fs" type="float" indexed="true" stored="true" multiValued="true" />
383
+ <dynamicField name="*_d" type="double" indexed="true" stored="true" />
384
+ <dynamicField name="*_ds" type="double" indexed="true" stored="true" multiValued="true" />
385
+ <dynamicField name="*_coordinate" type="double" indexed="true" stored="false" />
386
+ <dynamicField name="*_dt" type="date" indexed="true" stored="true" />
387
+ <dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true" />
388
+ <dynamicField name="*_p" type="location" indexed="true" stored="true" />
389
+ <dynamicField name="*_ti" type="int" indexed="true" stored="true" />
390
+ <dynamicField name="*_tis" type="int" indexed="true" stored="true" multiValued="true" />
391
+ <dynamicField name="*_tl" type="long" indexed="true" stored="true" />
392
+ <dynamicField name="*_tls" type="long" indexed="true" stored="true" multiValued="true" />
393
+ <dynamicField name="*_tf" type="float" indexed="true" stored="true" />
394
+ <dynamicField name="*_tfs" type="float" indexed="true" stored="true" multiValued="true" />
395
+ <dynamicField name="*_td" type="double" indexed="true" stored="true" />
396
+ <dynamicField name="*_tds" type="double" indexed="true" stored="true" multiValued="true" />
397
+ <dynamicField name="*_tdt" type="date" indexed="true" stored="true" />
398
+ <dynamicField name="*_tdts" type="date" indexed="true" stored="true" multiValued="true" />
399
+ <dynamicField name="ignored_*" type="ignored" multiValued="true" />
400
+ <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true" />
401
+ <dynamicField name="random_*" type="random" />
402
+
403
+ <dynamicField name="*_ws" type="text_ws" indexed="true" stored="true" />
404
+ <dynamicField name="*_txt_en" type="text_en" indexed="true" stored="true" />
405
+ <dynamicField name="*_txt_en_split" type="text_en_splitting" indexed="true" stored="true" />
406
+ <dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true" />
407
+ <dynamicField name="*_txt_rev" type="text_general_rev" indexed="true" stored="true" />
408
+ <dynamicField name="*_phon_en" type="phonetic_en" indexed="true" stored="true" />
409
+ <dynamicField name="*_s_lower" type="lowercase" indexed="true" stored="true" />
410
+ <dynamicField name="*_descendent_path" type="descendent_path" indexed="true" stored="true" />
411
+ <dynamicField name="*_ancestor_path" type="ancestor_path" indexed="true" stored="true" />
412
+ <dynamicField name="*_point" type="point" indexed="true" stored="true" />
413
+ <dynamicField name="*_txt_ga" type="text_ga" indexed="true" stored="true" />
414
+ </fields>
415
+
416
+ <uniqueKey>id</uniqueKey>
417
+
418
+ <!-- TODO: Remove all copyFields where the source is indexed as text and adjust solrconfig.xml
419
+ to also search in those fields (edismax parser qf) -->
420
+ <copyField source="author" dest="text" />
421
+ <copyField source="keywords" dest="text" />
422
+ <copyField source="wct_title" dest="text" />
423
+ <copyField source="wct_description" dest="text" />
424
+ <copyField source="content" dest="text" />
425
+ <copyField source="url_norm" dest="url_search" />
426
+ <copyField source="resourcename" dest="resourcename_facet"/>
427
+
428
+ <types>
429
+ <!-- Guiding principles:
430
+
431
+ Atomic types are single-valued indexed & docValues, but not stored. This allows for low-cost faceting,
432
+ grouping and sorting. The downside is a performance penalty on document retrieval where a full document
433
+ takes longer to retrieve. Enabling stored speeds up retrieval at the cost of increased index size.
434
+
435
+ Text types are single-valued indexed & stored, but not docValued (DV is not currently possible for Text).
436
+
437
+ Deviations are normally handled by overriding the attributes on the specific field definitions
438
+ -->
439
+
440
+ <fieldType name="string" class="solr.StrField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
441
+ <fieldType name="boolean" class="solr.BoolField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
442
+ <fieldType name="int" class="solr.IntPointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
443
+ <fieldType name="float" class="solr.FloatPointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
444
+ <fieldType name="long" class="solr.LongPointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
445
+ <fieldType name="double" class="solr.DoublePointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
446
+ <fieldType name="date" class="solr.DatePointField" indexed="true" docValues="true" stored="false" multiValued="false" sortMissingLast="true" />
447
+ <fieldType name="binary" class="solr.BinaryField" indexed="false" docValues="false" stored="true" multiValued="false" />
448
+ <fieldType name="random" class="solr.RandomSortField" />
449
+
450
+ <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
451
+ <analyzer>
452
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
453
+ </analyzer>
454
+ </fieldType>
173
455
 
174
- <dynamicField name="*_ws" type="text_ws" indexed="true" stored="true"/>
175
- <dynamicField name="*_txt_en" type="text_en" indexed="true" stored="true"/>
176
- <dynamicField name="*_txt_en_split" type="text_en_splitting" indexed="true" stored="true"/>
177
- <dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true"/>
178
- <dynamicField name="*_txt_rev" type="text_general_rev" indexed="true" stored="true"/>
179
- <dynamicField name="*_phon_en" type="phonetic_en" indexed="true" stored="true"/>
180
- <dynamicField name="*_s_lower" type="lowercase" indexed="true" stored="true"/>
181
- <dynamicField name="*_descendent_path" type="descendent_path" indexed="true" stored="true"/>
182
- <dynamicField name="*_ancestor_path" type="ancestor_path" indexed="true" stored="true"/>
183
- <dynamicField name="*_point" type="point" indexed="true" stored="true"/>
184
- <dynamicField name="*_txt_ga" type="text_ga" indexed="true" stored="true"/>
185
- </fields>
186
-
187
- <uniqueKey>id</uniqueKey>
188
-
189
- <copyField source="title" dest="text"/>
190
- <copyField source="author" dest="text"/>
191
- <copyField source="keywords" dest="text"/>
192
- <copyField source="description" dest="text"/>
193
- <copyField source="wct_title" dest="text"/>
194
- <copyField source="wct_description" dest="text"/>
195
- <copyField source="url" dest="text"/>
196
- <copyField source="content" dest="text"/>
197
- <copyField source="resourcename" dest="resourcename_facet"/>
198
-
199
- <types>
200
- <fieldType name="string" class="solr.StrField" sortMissingLast="true" />
201
- <fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true" docValues="true" />
202
- <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
203
- <fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
204
- <fieldType name="int" class="solr.TrieIntField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
205
- <fieldType name="float" class="solr.TrieFloatField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
206
- <fieldType name="long" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
207
- <fieldType name="double" class="solr.TrieDoubleField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
208
- <fieldType name="ints" class="solr.TrieIntField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
209
- <fieldType name="floats" class="solr.TrieFloatField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
210
- <fieldType name="longs" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
211
- <fieldType name="doubles" class="solr.TrieDoubleField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
212
- <fieldType name="tint" class="solr.TrieIntField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
213
- <fieldType name="tfloat" class="solr.TrieFloatField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
214
- <fieldType name="tlong" class="solr.TrieLongField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
215
- <fieldType name="tdouble" class="solr.TrieDoubleField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
216
- <fieldType name="tints" class="solr.TrieIntField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
217
- <fieldType name="tfloats" class="solr.TrieFloatField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
218
- <fieldType name="tlongs" class="solr.TrieLongField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
219
- <fieldType name="tdoubles" class="solr.TrieDoubleField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
220
- <fieldType name="date" class="solr.TrieDateField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
221
- <fieldType name="dates" class="solr.TrieDateField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
222
- <fieldType name="tdate" class="solr.TrieDateField" docValues="true" precisionStep="6" positionIncrementGap="0"/>
223
- <fieldType name="tdates" class="solr.TrieDateField" docValues="true" precisionStep="6" positionIncrementGap="0" multiValued="true"/>
224
- <fieldType name="binary" class="solr.BinaryField"/>
225
- <fieldType name="random" class="solr.RandomSortField" indexed="true" />
226
-
227
- <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
228
- <analyzer>
229
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
230
- </analyzer>
456
+ <!-- Used for parsing file paths, so that ["MOO BOO/FooBar_zoo.baz"] becomes ["moo", "boo", "foo", "bar", "zoo", "baz"] -->
457
+ <fieldType name="path" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
458
+ <analyzer type="index">
459
+ <tokenizer class="solr.StandardTokenizerFactory" />
460
+ <filter class="solr.WordDelimiterFilterFactory" preserveOriginal="0" />
461
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_path.txt" />
462
+ <filter class="solr.LowerCaseFilterFactory" />
463
+ </analyzer>
464
+ <analyzer type="query">
465
+ <tokenizer class="solr.StandardTokenizerFactory" />
466
+ <filter class="solr.WordDelimiterFilterFactory" preserveOriginal="0" />
467
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_path.txt" />
468
+ <filter class="solr.LowerCaseFilterFactory" />
469
+ </analyzer>
231
470
  </fieldType>
232
471
 
233
- <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" multiValued="true">
234
- <analyzer type="index">
235
- <tokenizer class="solr.StandardTokenizerFactory"/>
236
- <filter class="solr.LowerCaseFilterFactory"/>
237
- </analyzer>
238
- <analyzer type="query">
239
- <tokenizer class="solr.StandardTokenizerFactory"/>
240
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
241
- <filter class="solr.LowerCaseFilterFactory"/>
242
- </analyzer>
472
+ <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
473
+ <analyzer type="index">
474
+ <tokenizer class="solr.StandardTokenizerFactory" />
475
+ <filter class="solr.LowerCaseFilterFactory" />
476
+ </analyzer>
477
+ <analyzer type="query">
478
+ <tokenizer class="solr.StandardTokenizerFactory" />
479
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
480
+ <filter class="solr.LowerCaseFilterFactory" />
481
+ </analyzer>
243
482
  </fieldType>
244
483
 
245
- <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
246
- <analyzer type="index">
247
- <tokenizer class="solr.StandardTokenizerFactory"/>
248
- <filter class="solr.LowerCaseFilterFactory"/>
249
- <filter class="solr.EnglishPossessiveFilterFactory"/>
250
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
251
- </analyzer>
252
- <analyzer type="query">
253
- <tokenizer class="solr.StandardTokenizerFactory"/>
254
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
255
- <filter class="solr.LowerCaseFilterFactory"/>
256
- <filter class="solr.EnglishPossessiveFilterFactory"/>
257
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
258
- </analyzer>
484
+ <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
485
+ <analyzer type="index">
486
+ <tokenizer class="solr.StandardTokenizerFactory" />
487
+ <filter class="solr.LowerCaseFilterFactory" />
488
+ <filter class="solr.EnglishPossessiveFilterFactory" />
489
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
490
+ </analyzer>
491
+ <analyzer type="query">
492
+ <tokenizer class="solr.StandardTokenizerFactory" />
493
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
494
+ <filter class="solr.LowerCaseFilterFactory" />
495
+ <filter class="solr.EnglishPossessiveFilterFactory" />
496
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
497
+ </analyzer>
259
498
  </fieldType>
260
499
 
261
- <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
262
- <analyzer type="index">
263
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
264
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
265
- <filter class="solr.LowerCaseFilterFactory"/>
266
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
267
- </analyzer>
268
- <analyzer type="query">
269
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
270
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
271
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
272
- <filter class="solr.LowerCaseFilterFactory"/>
273
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
274
- </analyzer>
500
+ <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true" indexed="true" stored="true" multiValued="false">
501
+ <analyzer type="index">
502
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
503
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1" />
504
+ <filter class="solr.LowerCaseFilterFactory" />
505
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
506
+ </analyzer>
507
+ <analyzer type="query">
508
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
509
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
510
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1" />
511
+ <filter class="solr.LowerCaseFilterFactory" />
512
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
513
+ </analyzer>
275
514
  </fieldType>
276
515
 
277
- <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
278
- <analyzer>
279
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
280
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
281
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
282
- <filter class="solr.LowerCaseFilterFactory"/>
283
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
284
- <filter class="solr.EnglishMinimalStemFilterFactory"/>
285
- <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
286
- </analyzer>
516
+ <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true" indexed="true" stored="true" multiValued="false">
517
+ <analyzer>
518
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
519
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false" />
520
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0" />
521
+ <filter class="solr.LowerCaseFilterFactory" />
522
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
523
+ <filter class="solr.EnglishMinimalStemFilterFactory" />
524
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory" />
525
+ </analyzer>
287
526
  </fieldType>
288
527
 
289
- <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
290
- <analyzer type="index">
291
- <tokenizer class="solr.StandardTokenizerFactory"/>
292
- <filter class="solr.LowerCaseFilterFactory"/>
293
- <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
294
- maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
295
- </analyzer>
296
- <analyzer type="query">
297
- <tokenizer class="solr.StandardTokenizerFactory"/>
298
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
299
- <filter class="solr.LowerCaseFilterFactory"/>
300
- </analyzer>
528
+ <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
529
+ <analyzer type="index">
530
+ <tokenizer class="solr.StandardTokenizerFactory" />
531
+ <filter class="solr.LowerCaseFilterFactory" />
532
+ <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
533
+ maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33" />
534
+ </analyzer>
535
+ <analyzer type="query">
536
+ <tokenizer class="solr.StandardTokenizerFactory" />
537
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
538
+ <filter class="solr.LowerCaseFilterFactory" />
539
+ </analyzer>
301
540
  </fieldType>
302
541
 
303
- <fieldType name="phonetic_en" stored="false" indexed="true" class="solr.TextField" >
304
- <analyzer>
305
- <tokenizer class="solr.StandardTokenizerFactory"/>
306
- <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
307
- </analyzer>
542
+ <fieldType name="phonetic_en" class="solr.TextField" indexed="true" stored="true" multiValued="false">
543
+ <analyzer>
544
+ <tokenizer class="solr.StandardTokenizerFactory" />
545
+ <filter class="solr.DoubleMetaphoneFilterFactory" inject="false" />
546
+ </analyzer>
308
547
  </fieldType>
309
548
 
310
- <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
311
- <analyzer>
312
- <tokenizer class="solr.KeywordTokenizerFactory"/>
313
- <filter class="solr.LowerCaseFilterFactory" />
314
- </analyzer>
549
+ <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" multiValued="false">
550
+ <analyzer>
551
+ <tokenizer class="solr.KeywordTokenizerFactory" />
552
+ <filter class="solr.LowerCaseFilterFactory" />
553
+ </analyzer>
315
554
  </fieldType>
316
555
 
317
556
  <fieldType name="descendent_path" class="solr.TextField">
318
- <analyzer type="index">
319
- <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
320
- </analyzer>
321
- <analyzer type="query">
322
- <tokenizer class="solr.KeywordTokenizerFactory" />
323
- </analyzer>
557
+ <analyzer type="index">
558
+ <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
559
+ </analyzer>
560
+ <analyzer type="query">
561
+ <tokenizer class="solr.KeywordTokenizerFactory" />
562
+ </analyzer>
324
563
  </fieldType>
325
564
 
326
565
  <fieldType name="ancestor_path" class="solr.TextField">
327
- <analyzer type="index">
328
- <tokenizer class="solr.KeywordTokenizerFactory" />
329
- </analyzer>
330
- <analyzer type="query">
331
- <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
332
- </analyzer>
566
+ <analyzer type="index">
567
+ <tokenizer class="solr.KeywordTokenizerFactory" />
568
+ </analyzer>
569
+ <analyzer type="query">
570
+ <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
571
+ </analyzer>
333
572
  </fieldType>
334
573
 
335
574
  <fieldType name="ignored" stored="false" indexed="false" docValues="false" multiValued="true" class="solr.StrField" />
336
- <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
337
- <fieldType name="location" class="solr.LatLonPointSpatialField" docValues="true"/>
575
+ <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d" />
576
+ <fieldType name="location" class="solr.LatLonPointSpatialField" indexed="true" stored="false" docValues="true" multiValued="false" />
338
577
 
339
578
  <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100">
340
- <analyzer>
341
- <tokenizer class="solr.StandardTokenizerFactory"/>
342
- <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/>
343
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt"/>
344
- <filter class="solr.IrishLowerCaseFilterFactory"/>
345
- </analyzer>
579
+ <analyzer>
580
+ <tokenizer class="solr.StandardTokenizerFactory" />
581
+ <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt" />
582
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" />
583
+ <filter class="solr.IrishLowerCaseFilterFactory" />
584
+ </analyzer>
346
585
  </fieldType>
347
586
 
348
587
  <!-- BL UKWA: additional -->
349
- <fieldType name="literal_ngram" stored="false" indexed="true" class="solr.TextField">
350
- <analyzer>
351
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
352
- <filter class="solr.NGramFilterFactory" minGramSize="2" maxGramSize="5"/>
353
- </analyzer>
588
+ <fieldType name="literal_ngram" class="solr.TextField" indexed="true" stored="false" multiValued="false" >
589
+ <analyzer>
590
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
591
+ <filter class="solr.NGramFilterFactory" minGramSize="2" maxGramSize="5" />
592
+ </analyzer>
354
593
  </fieldType>
355
594
 
356
- <fieldType name="hex_text_shingle" class="solr.TextField" positionIncrementGap="100">
357
- <analyzer>
358
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
359
- <filter class="solr.ShingleFilterFactory" minShingleSize="4" maxShingleSize="8" outputUnigrams="false" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
360
- </analyzer>
595
+ <fieldType name="hex_text_shingle" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="true" docValues="false">
596
+ <analyzer>
597
+ <tokenizer class="solr.WhitespaceTokenizerFactory" />
598
+ <filter class="solr.ShingleFilterFactory" minShingleSize="4" maxShingleSize="8" outputUnigrams="false" outputUnigramsIfNoShingles="false" tokenSeparator=" " />
599
+ </analyzer>
361
600
  </fieldType>
362
601
  <!--:BL UKWA -->
363
- </types>
602
+ </types>
364
603
  </schema>