solrclient 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. solrclient-0.4.0/PKG-INFO +16 -0
  2. solrclient-0.4.0/README.md +77 -0
  3. solrclient-0.4.0/SolrClient/__init__.py +10 -0
  4. solrclient-0.4.0/SolrClient/collections.py +183 -0
  5. solrclient-0.4.0/SolrClient/exceptions.py +41 -0
  6. solrclient-0.4.0/SolrClient/helpers/__init__.py +1 -0
  7. solrclient-0.4.0/SolrClient/helpers/reindexer.py +291 -0
  8. solrclient-0.4.0/SolrClient/indexq.py +479 -0
  9. solrclient-0.4.0/SolrClient/routers/__init__.py +0 -0
  10. solrclient-0.4.0/SolrClient/routers/aware.py +134 -0
  11. solrclient-0.4.0/SolrClient/routers/base.py +35 -0
  12. solrclient-0.4.0/SolrClient/routers/plain.py +20 -0
  13. solrclient-0.4.0/SolrClient/routers/pymmh3.py +451 -0
  14. solrclient-0.4.0/SolrClient/schema.py +133 -0
  15. solrclient-0.4.0/SolrClient/solrclient.py +355 -0
  16. solrclient-0.4.0/SolrClient/solrresp.py +407 -0
  17. solrclient-0.4.0/SolrClient/transport/__init__.py +2 -0
  18. solrclient-0.4.0/SolrClient/transport/transportbase.py +66 -0
  19. solrclient-0.4.0/SolrClient/transport/transportrequests.py +72 -0
  20. solrclient-0.4.0/SolrClient/zk.py +176 -0
  21. solrclient-0.4.0/setup.cfg +4 -0
  22. solrclient-0.4.0/setup.py +20 -0
  23. solrclient-0.4.0/solrclient.egg-info/PKG-INFO +16 -0
  24. solrclient-0.4.0/solrclient.egg-info/SOURCES.txt +36 -0
  25. solrclient-0.4.0/solrclient.egg-info/dependency_links.txt +1 -0
  26. solrclient-0.4.0/solrclient.egg-info/requires.txt +2 -0
  27. solrclient-0.4.0/solrclient.egg-info/top_level.txt +2 -0
  28. solrclient-0.4.0/test/RandomTestData.py +80 -0
  29. solrclient-0.4.0/test/__init__.py +2 -0
  30. solrclient-0.4.0/test/quick.py +17 -0
  31. solrclient-0.4.0/test/test_client.py +334 -0
  32. solrclient-0.4.0/test/test_collections.py +70 -0
  33. solrclient-0.4.0/test/test_config.py +28 -0
  34. solrclient-0.4.0/test/test_indexq.py +658 -0
  35. solrclient-0.4.0/test/test_reindexer.py +417 -0
  36. solrclient-0.4.0/test/test_resp.py +237 -0
  37. solrclient-0.4.0/test/test_schema.py +61 -0
  38. solrclient-0.4.0/test/test_zk.py +59 -0
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: solrclient
3
+ Version: 0.4.0
4
+ Summary: Python based client for Solr.
5
+ Home-page: https://github.com/moonlitesolutions/SolrClient
6
+ Author: Nick Vasilyev
7
+ Author-email: nick.vasilyev1@gmail.com
8
+ License: MIT
9
+ Requires-Dist: requests>=2.2.1
10
+ Requires-Dist: kazoo==2.6.1
11
+ Dynamic: author
12
+ Dynamic: author-email
13
+ Dynamic: home-page
14
+ Dynamic: license
15
+ Dynamic: requires-dist
16
+ Dynamic: summary
@@ -0,0 +1,77 @@
1
+ [![Documentation Status](https://readthedocs.org/projects/solrclient/badge/?version=latest)](http://solrclient.readthedocs.org/en/latest/?badge=latest)
2
+
3
+ # SolrClient
4
+ SolrClient 0.4.0
5
+ ----------
6
+ SolrClient is a simple python library for Solr; built in python3 with support for latest features of Solr 5 and 6. Development is heavily focused on indexing as well as parsing various query responses and returning them in native python data structures. Several helper classes will be built to automate querying and management of Solr clusters.
7
+
8
+ Enhancements in version 0.2.0:
9
+ * Basic parsing for json.facet output
10
+ * Better support for grouped results (SolrResponse)
11
+ * Other minor enhancements to SolrClient
12
+ * Fixed SolrClient.index method
13
+
14
+ Planned enhancements in version 0.3.0:
15
+ * Solr node query routing (by @ddorian)
16
+ * Streaming Expressions Support
17
+ *
18
+
19
+ Requirements
20
+ ----------
21
+ * python 3.3+
22
+ * requests library (http://docs.python-requests.org/en/latest/)
23
+ * Solr
24
+ * kazoo for working with zookeeper (optional)
25
+
26
+
27
+ Features
28
+ ----------
29
+ * Flexible and simple query mechanism
30
+ * Response Object to easily extract data from Solr Response
31
+ * Cursor Mark support
32
+ * Indexing (raw JSON, JSON Files, gzipped JSON)
33
+ * Specify multiple hosts/IPs for SolrCloud for redundancy
34
+ * Basic Managed Schema field management
35
+ * IndexManager for storing indexing documents off-line and batch indexing them
36
+
37
+ Getting Started
38
+ ----------
39
+ Installation:
40
+
41
+ pip install SolrClient
42
+
43
+ Basic usage:
44
+
45
+ >>> from SolrClient import SolrClient
46
+ >>> solr = SolrClient('http://localhost:8983/solr')
47
+ >>> res = solr.query('SolrClient_unittest',{
48
+ 'q':'product_name:Lorem',
49
+ 'facet':True,
50
+ 'facet.field':'facet_test',
51
+ })
52
+ >>> res.get_results_count()
53
+ 4
54
+ >>> res.get_facets()
55
+ {'facet_test': {'ipsum': 0, 'sit': 0, 'dolor': 2, 'amet,': 1, 'Lorem': 1}}
56
+ >>> res.get_facet_keys_as_list('facet_test')
57
+ ['ipsum', 'sit', 'dolor', 'amet,', 'Lorem']
58
+ >>> res.docs
59
+ [{'product_name_exact': 'orci. Morbi ipsum
60
+ ..... all the docs ....
61
+ 'consectetur Mauris dolor Lorem adipiscing'}]
62
+
63
+ See, easy.... you just need to know the Solr query syntax.
64
+
65
+
66
+ Roadmap
67
+ ----------
68
+ * Better test coverage
69
+ * Solr Streaming
70
+
71
+ Contributing
72
+ ----------
73
+ I've realized that there isn't really a well maintained Solr Python library I liked so I put this together. Contributions (code, tests, documentation) are definitely welcome; if you have a question about development please open up an issue on the github page. If you have a pull request, please make sure to add tests and that all of them pass before submitting. See the tests README for testing resources.
74
+
75
+
76
+ Documentation:
77
+ http://solrclient.readthedocs.org/en/latest/
@@ -0,0 +1,10 @@
1
+ from .solrclient import SolrClient
2
+ from .solrresp import SolrResponse
3
+ from .schema import Schema
4
+ from .indexq import IndexQ
5
+ from .helpers import Reindexer
6
+ from .collections import Collections
7
+ from .zk import ZK
8
+ #This is the main project version. On new releases, it only needs to be updated here and in the README.md.
9
+ #Documentation and setup.py will pull from here.
10
+ __version__ = '0.4.0'
@@ -0,0 +1,183 @@
1
+ import json
2
+ import logging
3
+ from .transport import TransportRequests
4
+ from .schema import Schema
5
+ from .exceptions import *
6
+ from .solrresp import SolrResponse
7
+
8
+
9
class Collections():
    """
    Provides an interface to the Solr Collections API.
    Documentation: https://cwiki.apache.org/confluence/display/solr/Collections+API
    """

    def __init__(self, solr, log):
        """
        :param solr: SolrClient instance used to reach the cluster.
        :param log: Logger used for progress and error reporting.
        """
        self.solr = solr
        self.logger = log
        # Cache of per-node SolrClient instances keyed by base_url; used by
        # _get_collection_counts to query each core on its own node.
        self.solr_clients = {}

    def api(self, action, args=None):
        """
        Sends a request to the Solr Collections API.
        Documentation is here: https://cwiki.apache.org/confluence/display/solr/Collections+API

        :param string action: Name of the Collections API action (e.g. LIST, CREATE)
        :param dict args: Dictionary of specific parameters for the action
        :returns: Tuple of (response dict, connection info)
        :raises SolrError: if the request fails or Solr reports a non-zero status
        """
        if args is None:
            args = {}
        args['action'] = action.upper()

        try:
            res, con_info = self.solr.transport.send_request(endpoint='admin/collections', params=args)
        except Exception as e:
            self.logger.error("Error querying SolrCloud Collections API. ")
            self.logger.exception(e)
            raise e

        if 'responseHeader' in res and res['responseHeader']['status'] == 0:
            return res, con_info
        # Bug fix: the original format string had one placeholder for two
        # arguments, so the response body was silently dropped from the message.
        raise SolrError("Error Issuing Collections API Call for: {} - {}".format(con_info, res))

    def clusterstatus(self):
        """
        Returns a slightly slimmed down version of the clusterstatus API command.
        Also queries each replica for its document count, returned under the
        'doc_count' key for each replica (False when the replica isn't active
        or the count couldn't be retrieved).
        """
        res = self.cluster_status_raw()

        cluster = res['cluster']['collections']
        out = {}
        try:
            for coll_name, coll in cluster.items():
                out[coll_name] = {}
                for shard_name, shard in coll['shards'].items():
                    out[coll_name][shard_name] = {}
                    for repl_name, replica in shard['replicas'].items():
                        out[coll_name][shard_name][repl_name] = replica
                        if replica['state'] != 'active':
                            replica['doc_count'] = False
                        else:
                            replica['doc_count'] = self._get_collection_counts(replica)
        except Exception as e:
            self.logger.error("Couldn't parse response from clusterstatus API call")
            self.logger.exception(e)

        return out

    def cluster_status_raw(self, **kwargs):
        """
        Returns the raw output of the clusterstatus API command.
        """
        res, con_info = self.api('clusterstatus', **kwargs)
        return res

    def exists(self, collection):
        """
        Return True if a collection exists, False otherwise.
        (The original returned an implicit None for missing collections.)
        """
        return collection in self.list()

    def list(self):
        """
        Returns a list[string] of all collection names on the cluster.
        """
        res, con_info = self.api('LIST')
        return res['collections']

    def create(self, name, numShards, params=None):
        """
        Create a new collection.

        :param string name: Name of the new collection.
        :param int numShards: Number of shards to create the collection with.
        :param dict params: Extra Collections API parameters; note that
            name/numShards always override any duplicates supplied here.
        """
        if params is None:
            params = {}
        params.update(
            name=name,
            numShards=numShards
        )
        return self.api('CREATE', params)

    def _get_collection_counts(self, core_data):
        """
        Queries a core directly (distrib=false) to get its individual document
        count. Returns False when the count couldn't be retrieved.
        """
        base_url = core_data['base_url']
        if base_url not in self.solr_clients:
            from SolrClient import SolrClient
            # Bug fix: the original cached every client under the literal key
            # 'base_url', so the membership check never matched and lookups on
            # multi-node clusters always hit whichever node was stored last.
            self.solr_clients[base_url] = SolrClient(base_url, log=self.logger)
        try:
            return self.solr_clients[base_url].query(core_data['core'], {
                'q': '*:*',
                'rows': 0,
                'distrib': 'false',
            }).get_num_found()
        except Exception as e:
            self.logger.error("Couldn't get Counts for {}/{}".format(base_url, core_data['core']))
            self.logger.exception(e)
            return False

    def _for_core(self, cluster_resp=None):
        """
        Yields (collection, shard, core, core_data) for every core in the
        (possibly freshly fetched) clusterstatus output.
        """
        if cluster_resp is None:
            cluster_resp = self.clusterstatus()
        for collection in cluster_resp:
            for shard in cluster_resp[collection]:
                for core in cluster_resp[collection][shard]:
                    yield collection, shard, core, cluster_resp[collection][shard][core]

    def _for_shard(self, cluster_resp=None):
        """
        Yields (collection, shard, shard_data) for every shard in the
        (possibly freshly fetched) clusterstatus output.
        """
        if cluster_resp is None:
            cluster_resp = self.clusterstatus()
        for collection in cluster_resp:
            for shard in cluster_resp[collection]:
                yield collection, shard, cluster_resp[collection][shard]

    def check_status(self, ignore=(), status=None):
        """
        Checks the status of each collection and shard to make sure that:
        a) Cluster state is active
        b) Number of docs matches across replicas for a given shard.
        Returns a dict of results for custom alerting.

        :param ignore: Iterable of check messages to skip.
        :param status: Pre-fetched clusterstatus() output; fetched if None.
        """
        self.SHARD_CHECKS = [
            {'check_msg': 'Bad Core Count Check', 'f': self._check_shard_count},
            {'check_msg': 'Bad Shard Cluster Status', 'f': self._check_shard_status}
        ]
        if status is None:
            status = self.clusterstatus()
        out = {}
        for collection in status:
            out[collection] = {}
            out[collection]['coll_status'] = True  # Means it's fine
            out[collection]['coll_messages'] = []
            for shard in status[collection]:
                self.logger.debug("Checking {}/{}".format(collection, shard))
                s_dict = status[collection][shard]
                for check in self.SHARD_CHECKS:
                    if check['check_msg'] in ignore:
                        continue
                    if not check['f'](s_dict):
                        out[collection]['coll_status'] = False
                        if check['check_msg'] not in out[collection]['coll_messages']:
                            out[collection]['coll_messages'].append(check['check_msg'])
                        self.logger.debug(s_dict)
        return out

    def _check_shard_count(self, cores_dict):
        # All replicas of a shard must report the same doc_count.
        return len({cores_dict[core]['doc_count'] for core in cores_dict}) <= 1

    def _check_shard_status(self, cores_dict):
        # Every replica of the shard must be in the 'active' state.
        for core in cores_dict:
            if cores_dict[core]['state'] != 'active':
                return False
        return True
@@ -0,0 +1,41 @@
1
class SolrError(Exception):
    """
    Base exception for any issue that Solr reports; all other exceptions
    in this module derive from it.
    """
5
+
6
+
7
class SolrResponseError(SolrError):
    """
    Errors related to parsing a Solr response.
    """
11
+
12
+
13
class ConnectionError(SolrError):
    """
    Errors connecting to Solr.

    NOTE(review): this shadows the builtin ``ConnectionError`` for any module
    doing a star-import of these exceptions; callers should catch it via the
    package namespace to avoid ambiguity.
    """
17
+
18
+
19
class ZookeeperError(SolrError):
    """
    Errors connecting to Zookeeper.
    """
23
+
24
+
25
class NotFoundError(SolrError):
    """
    Raised when a document wasn't found.
    """
29
+
30
+
31
class MinRfError(SolrError):
    """
    Raised when an index request didn't satisfy the required minimum
    replication factor (min_rf).
    """
    # Replication factor actually achieved by the request.
    rf = None
    # Minimum replication factor the request demanded.
    min_rf = None

    def __init__(self, message, rf, min_rf, **kwargs):
        """
        :param message: Human readable error message.
        :param rf: Achieved replication factor.
        :param min_rf: Required minimum replication factor.
        """
        self.rf = rf
        self.min_rf = min_rf
        # Bug fix: the original called super().__init__(self, message, ...),
        # passing the instance as an extra positional arg, which corrupted
        # Exception.args and str(e).
        super().__init__(message, **kwargs)
@@ -0,0 +1 @@
1
+ from .reindexer import Reindexer
@@ -0,0 +1,291 @@
1
+ import datetime
2
+ import logging
3
+ import sys
4
+ import gzip
5
+ import argparse
6
+ import os
7
+ import json
8
+ from datetime import datetime, timedelta
9
+ from time import time, sleep
10
+ from SolrClient import SolrClient, IndexQ
11
+
12
+
13
class Reindexer():
    '''
    Initiates the re-indexer.

    :param source: An instance of SolrClient or a directory path containing json files.
    :param dest: An instance of SolrClient or an instance of IndexQ.
    :param string source_coll: Source collection name.
    :param string dest_coll: Destination collection name; only required if destination is SolrClient.
    :param int rows: Number of items to get in each query; default is 1000, however you will probably want to increase it.
    :param string date_field: String name of a Solr date field to use in sort and resume.
    :param bool devel: Whether to turn on super verbose logging for development. Standard DEBUG should suffice for most development.
    :param bool per_shard: Will add distrib=false to each query to get the data. Use this only if you will be running multiple instances of this to get the rest of the shards.
    :param list ignore_fields: What fields to exclude from Solr queries. This is important since if you don't pull them out, you won't be able to index the documents in.
        By default, it will try to determine and exclude copy fields as well as _version_. Pass in your own list to override or set it to False to prevent it from doing anything.
    '''
    def __init__(self,
                 source,
                 dest,
                 source_coll=None,
                 dest_coll=None,
                 rows=1000,
                 date_field=None,
                 devel=False,
                 per_shard=False,
                 ignore_fields=None,
                 ):

        self.log = logging.getLogger('reindexer')

        self._source = source
        self._source_coll = source_coll
        self._dest = dest
        self._dest_coll = dest_coll
        self._rows = rows
        self._date_field = date_field
        self._per_shard = per_shard
        self._items_processed = 0
        self._devel = devel

        # Bug fix: the original used a shared mutable default (['_version_'])
        # and extended it in place below, leaking copy-field exclusions across
        # Reindexer instances. None now means "default", and caller-supplied
        # lists are copied so they are never mutated. False still disables
        # trimming entirely.
        if ignore_fields is None:
            ignore_fields = ['_version_']
        self._ignore_fields = list(ignore_fields) if isinstance(ignore_fields, list) else ignore_fields

        # Determine what source and destination should be
        if type(source) is SolrClient and source_coll:
            self._getter = self._from_solr
            # Maybe break this out later for the sake of testing: only the
            # default single-entry exclusion list gets copy fields appended.
            if type(self._ignore_fields) is list and len(self._ignore_fields) == 1:
                self._ignore_fields.extend(self._get_copy_fields())
        elif type(source) is str and os.path.isdir(source):
            # NOTE(review): _from_json is referenced here but not defined in
            # this module — a directory source will fail with AttributeError
            # when reindex() is called. TODO confirm against the full package.
            self._getter = self._from_json
        else:
            raise ValueError("Incorrect Source Specified. Pass either a directory with json files or source SolrClient "
                             "instance with the name of the collection.")

        if type(self._dest) is SolrClient and self._dest_coll:
            self._putter = self._to_solr
        elif type(dest) is IndexQ:
            self._putter = self._to_IndexQ
        else:
            raise ValueError("Incorrect Destination Specified. Pass either a directory with json files or destination SolrClient "
                             "instance with the name of the collection.")
        self.log.info("Reindexer created successfully. ")

    def _get_copy_fields(self):
        '''
        Returns the destination field names of all copy fields in the source
        collection's schema, so they can be excluded from retrieved documents.
        '''
        if self._devel:
            self.log.debug("Getting additional copy fields to exclude")
            self.log.debug(self._source.schema.get_schema_copyfields(self._source_coll))
        fields = [field['dest'] for field in self._source.schema.get_schema_copyfields(self._source_coll)]
        self.log.info("Field exclusions are: {}".format(", ".join(fields)))
        return fields

    def reindex(self, fq=None, **kwargs):
        '''
        Starts the reindexing process. All keyword arguments will be passed down to the getter function.

        :param list fq: Filter queries to pass to source Solr to retrieve items. This can be used to limit the results.
        '''
        if fq is None:
            fq = []
        for items in self._getter(fq=fq, **kwargs):
            self._putter(items)
        if type(self._dest) is SolrClient and self._dest_coll:
            self.log.info("Finished Indexing, sending a commit")
            self._dest.commit(self._dest_coll, openSearcher=True)

    def _from_solr(self, fq=None, report_frequency=25):
        '''
        Generator that retrieves batches of documents from the source Solr
        collection using cursorMark deep paging.

        :param list fq: Extra filter queries applied to every batch query.
        :param int report_frequency: Log a progress line every N queries.
        '''
        if fq is None:
            fq = []
        cursor = '*'
        stime = datetime.now()
        query_count = 0
        while True:
            # Get data with the current cursorMark.
            query = self._get_query(cursor)
            # Add FQ to the query. This is used by resume() to filter on date
            # fields and when specifying a document subset. Not included in
            # _get_query for more flexibility.
            if fq:
                if 'fq' in query:
                    query['fq'].extend(fq)
                else:
                    query['fq'] = list(fq)

            results = self._source.query(self._source_coll, query)
            query_count += 1
            if query_count % report_frequency == 0:
                # max(..., 1) guards against ZeroDivisionError when the first
                # report fires within the first second.
                elapsed = max(int((datetime.now() - stime).seconds), 1)
                self.log.info("Processed {} Items in {} Seconds. Approximately {} items/minute".format(
                    self._items_processed, elapsed,
                    str(int(self._items_processed / (elapsed / 60)))
                ))

            if results.get_results_count():
                # If we got items back, get the new cursor and yield the docs.
                self._items_processed += results.get_results_count()
                cursor = results.get_cursor()
                # Remove ignored fields before handing the docs over.
                docs = self._trim_fields(results.docs)
                yield docs
                if results.get_results_count() < self._rows:
                    # Fewer results than requested, probably done.
                    break
            else:
                # No results, probably done :)
                self.log.debug("Got zero Results with cursor: {}".format(cursor))
                break

    def _trim_fields(self, docs):
        '''
        Removes ignore fields from the data that we got from Solr.
        Returns the same list with each doc mutated in place.
        '''
        # Bug fix: ignore_fields=False is documented as "do nothing", but the
        # original iterated it unconditionally and raised TypeError.
        if not self._ignore_fields:
            return docs
        for doc in docs:
            for field in self._ignore_fields:
                doc.pop(field, None)
        return docs

    def _get_query(self, cursor):
        '''
        Query template for source Solr; sorts by id by default, or by
        date_field (then id) when one was configured.
        '''
        query = {'q': '*:*',
                 'sort': 'id desc',
                 'rows': self._rows,
                 'cursorMark': cursor}
        if self._date_field:
            query['sort'] = "{} asc, id desc".format(self._date_field)
        if self._per_shard:
            query['distrib'] = 'false'
        return query

    def _to_IndexQ(self, data):
        '''
        Sends data to the IndexQ instance.
        '''
        self._dest.add(data)

    def _to_solr(self, data):
        '''
        Sends data to the destination Solr instance.
        '''
        return self._dest.index_json(self._dest_coll, json.dumps(data, sort_keys=True))

    def _get_date_range_query(self, start_date, end_date, timespan='DAY', date_field=None):
        '''
        Builds a range-facet query to get counts of items per date range.

        :param start_date: Facet range start (Solr date string).
        :param end_date: Facet range end (Solr date string).
        :param timespan: Solr Date Math compliant gap, e.g. HOUR, MONTH, DAY.
        :param date_field: Field to facet on; defaults to the configured date_field.
        '''
        if date_field is None:
            date_field = self._date_field
        query = {'q': '*:*',
                 'rows': 0,
                 'facet': 'true',
                 'facet.range': date_field,
                 'facet.range.gap': '+1{}'.format(timespan),
                 'facet.range.end': '{}'.format(end_date),
                 'facet.range.start': '{}'.format(start_date),
                 'facet.range.include': 'all'
                 }
        if self._per_shard:
            query['distrib'] = 'false'
        return query

    def _get_edge_date(self, date_field, sort):
        '''
        Returns the earliest (sort='asc') or latest (sort='desc') value of
        date_field present in the source collection.
        '''
        return self._source.query(self._source_coll, {
            'q': '*:*',
            'rows': 1,
            'fq': '+{}:*'.format(date_field),
            'sort': '{} {}'.format(date_field, sort)}).docs[0][date_field]

    def _get_date_facet_counts(self, timespan, date_field, start_date=None, end_date=None):
        '''
        Returns (source, destination) range-facet count dicts for date_field,
        covering start_date..end_date (edge dates are auto-detected if omitted).
        '''
        if 'DAY' not in timespan:
            raise ValueError("At this time, only DAY date range increment is supported. Aborting..... ")

        # Need to do this a bit better later. Don't like the string and date concatenations.
        if not start_date:
            start_date = self._get_edge_date(date_field, 'asc')
            start_date = datetime.strptime(start_date, '%Y-%m-%dT%H:%M:%S.%fZ').date().isoformat() + 'T00:00:00.000Z'
        else:
            start_date = start_date + 'T00:00:00.000Z'

        if not end_date:
            end_date = self._get_edge_date(date_field, 'desc')
            end_date = datetime.strptime(end_date, '%Y-%m-%dT%H:%M:%S.%fZ').date()
            end_date += timedelta(days=1)
            end_date = end_date.isoformat() + 'T00:00:00.000Z'
        else:
            end_date = end_date + 'T00:00:00.000Z'

        self.log.info("Processing Items from {} to {}".format(start_date, end_date))

        # Get facet counts for source and destination collections.
        # Bug fix: date_field is now passed through, so faceting and result
        # lookup use the same field even when it differs from self._date_field.
        source_facet = self._source.query(
            self._source_coll,
            self._get_date_range_query(timespan=timespan, start_date=start_date,
                                       end_date=end_date, date_field=date_field)
        ).get_facets_ranges()[date_field]
        dest_facet = self._dest.query(
            self._dest_coll,
            self._get_date_range_query(timespan=timespan, start_date=start_date,
                                       end_date=end_date, date_field=date_field)
        ).get_facets_ranges()[date_field]
        return source_facet, dest_facet

    def resume(self, start_date=None, end_date=None, timespan='DAY', check=False):
        '''
        This method may help if the original run was interrupted for some reason. It will only work under the following conditions
        * You have a date field that you can facet on
        * Indexing was stopped for the duration of the copy

        The way this tries to resume re-indexing is by running a date range facet on the source and destination collections. It then compares
        the counts in both collections for each timespan specified. If the counts are different, it will re-index items for each range where
        the counts are off. You can also pass in a start_date to only get items after a certain time period. Note that each date range will be indexed in
        its entirety, even if there is only one item missing.

        Keep in mind this only checks the counts and not actual data. So make sure the indexes weren't modified between the reindexing execution and
        running the resume operation.

        :param start_date: Date to start indexing from. If not specified there will be no restrictions and all data will be processed. Note that
            this value will be passed to Solr directly and not modified.
        :param end_date: The date to index items up to.
        :param timespan: Solr Date Math compliant value for faceting; currently only DAY is supported.
        :param check: If set to True it will only log differences between the two collections without actually modifying the destination.
        '''
        if type(self._source) is not SolrClient or type(self._dest) is not SolrClient:
            raise ValueError("To resume, both source and destination need to be Solr.")

        source_facet, dest_facet = self._get_date_facet_counts(
            timespan, self._date_field, start_date=start_date, end_date=end_date)

        for dt_range in sorted(source_facet):
            if dt_range in dest_facet:
                self.log.info("Date Range: {} Source: {} Destination:{} Difference:{}".format(
                    dt_range, source_facet[dt_range], dest_facet[dt_range],
                    (source_facet[dt_range] - dest_facet[dt_range])))
                if check:
                    continue
                if source_facet[dt_range] > dest_facet[dt_range]:
                    # Kicks off reindexing with an additional FQ limiting it
                    # to just this date range.
                    self.reindex(fq=['{}:[{} TO {}+1{}]'.format(self._date_field, dt_range, dt_range, timespan)])
                    self.log.info("Complete Date Range {}".format(dt_range))
            else:
                # A range present in the source is missing from the destination
                # facets entirely; counts can't be reconciled.
                self.log.error("Something went wrong; Source: {}".format(source_facet))
                self.log.error("Destination: {}".format(dest_facet))
                raise ValueError("Date Ranges don't match up")
        self._dest.commit(self._dest_coll, openSearcher=True)