solrclient 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- solrclient-0.4.0/PKG-INFO +16 -0
- solrclient-0.4.0/README.md +77 -0
- solrclient-0.4.0/SolrClient/__init__.py +10 -0
- solrclient-0.4.0/SolrClient/collections.py +183 -0
- solrclient-0.4.0/SolrClient/exceptions.py +41 -0
- solrclient-0.4.0/SolrClient/helpers/__init__.py +1 -0
- solrclient-0.4.0/SolrClient/helpers/reindexer.py +291 -0
- solrclient-0.4.0/SolrClient/indexq.py +479 -0
- solrclient-0.4.0/SolrClient/routers/__init__.py +0 -0
- solrclient-0.4.0/SolrClient/routers/aware.py +134 -0
- solrclient-0.4.0/SolrClient/routers/base.py +35 -0
- solrclient-0.4.0/SolrClient/routers/plain.py +20 -0
- solrclient-0.4.0/SolrClient/routers/pymmh3.py +451 -0
- solrclient-0.4.0/SolrClient/schema.py +133 -0
- solrclient-0.4.0/SolrClient/solrclient.py +355 -0
- solrclient-0.4.0/SolrClient/solrresp.py +407 -0
- solrclient-0.4.0/SolrClient/transport/__init__.py +2 -0
- solrclient-0.4.0/SolrClient/transport/transportbase.py +66 -0
- solrclient-0.4.0/SolrClient/transport/transportrequests.py +72 -0
- solrclient-0.4.0/SolrClient/zk.py +176 -0
- solrclient-0.4.0/setup.cfg +4 -0
- solrclient-0.4.0/setup.py +20 -0
- solrclient-0.4.0/solrclient.egg-info/PKG-INFO +16 -0
- solrclient-0.4.0/solrclient.egg-info/SOURCES.txt +36 -0
- solrclient-0.4.0/solrclient.egg-info/dependency_links.txt +1 -0
- solrclient-0.4.0/solrclient.egg-info/requires.txt +2 -0
- solrclient-0.4.0/solrclient.egg-info/top_level.txt +2 -0
- solrclient-0.4.0/test/RandomTestData.py +80 -0
- solrclient-0.4.0/test/__init__.py +2 -0
- solrclient-0.4.0/test/quick.py +17 -0
- solrclient-0.4.0/test/test_client.py +334 -0
- solrclient-0.4.0/test/test_collections.py +70 -0
- solrclient-0.4.0/test/test_config.py +28 -0
- solrclient-0.4.0/test/test_indexq.py +658 -0
- solrclient-0.4.0/test/test_reindexer.py +417 -0
- solrclient-0.4.0/test/test_resp.py +237 -0
- solrclient-0.4.0/test/test_schema.py +61 -0
- solrclient-0.4.0/test/test_zk.py +59 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: solrclient
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Python based client for Solr.
|
|
5
|
+
Home-page: https://github.com/moonlitesolutions/SolrClient
|
|
6
|
+
Author: Nick Vasilyev
|
|
7
|
+
Author-email: nick.vasilyev1@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Requires-Dist: requests>=2.2.1
|
|
10
|
+
Requires-Dist: kazoo==2.6.1
|
|
11
|
+
Dynamic: author
|
|
12
|
+
Dynamic: author-email
|
|
13
|
+
Dynamic: home-page
|
|
14
|
+
Dynamic: license
|
|
15
|
+
Dynamic: requires-dist
|
|
16
|
+
Dynamic: summary
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
[](http://solrclient.readthedocs.org/en/latest/?badge=latest)
|
|
2
|
+
|
|
3
|
+
# SolrClient
|
|
4
|
+
SolrClient 0.4.0
|
|
5
|
+
----------
|
|
6
|
+
SolrClient is a simple python library for Solr; built in python3 with support for latest features of Solr 5 and 6. Development is heavily focused on indexing as well as parsing various query responses and returning them in native python data structures. Several helper classes will be built to automate querying and management of Solr clusters.
|
|
7
|
+
|
|
8
|
+
Enhancements in version 0.2.0:
|
|
9
|
+
* Basic parsing for json.facet output
|
|
10
|
+
* Better support for grouped results (SolrResponse)
|
|
11
|
+
* Other minor enhancements to SolrClient
|
|
12
|
+
* Fixed SolrClient.index method
|
|
13
|
+
|
|
14
|
+
Planned enhancements in version 0.3.0:
|
|
15
|
+
* Solr node query routing (by @ddorian)
|
|
16
|
+
* Streaming Expressions Support
|
|
17
|
+
*
|
|
18
|
+
|
|
19
|
+
Requirements
|
|
20
|
+
----------
|
|
21
|
+
* python 3.3+
|
|
22
|
+
* requests library (http://docs.python-requests.org/en/latest/)
|
|
23
|
+
* Solr
|
|
24
|
+
* kazoo for working with zookeeper (optional)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
Features
|
|
28
|
+
----------
|
|
29
|
+
* Flexible and simple query mechanism
|
|
30
|
+
* Response Object to easily extract data from Solr Response
|
|
31
|
+
* Cursor Mark support
|
|
32
|
+
* Indexing (raw JSON, JSON Files, gzipped JSON)
|
|
33
|
+
* Specify multiple hosts/IPs for SolrCloud for redundancy
|
|
34
|
+
* Basic Managed Schema field management
|
|
35
|
+
* IndexManager for storing indexing documents off-line and batch indexing them
|
|
36
|
+
|
|
37
|
+
Getting Started
|
|
38
|
+
----------
|
|
39
|
+
Installation:
|
|
40
|
+
|
|
41
|
+
pip install SolrClient
|
|
42
|
+
|
|
43
|
+
Basic usage:
|
|
44
|
+
|
|
45
|
+
>>> from SolrClient import SolrClient
|
|
46
|
+
>>> solr = SolrClient('http://localhost:8983/solr')
|
|
47
|
+
>>> res = solr.query('SolrClient_unittest',{
|
|
48
|
+
'q':'product_name:Lorem',
|
|
49
|
+
'facet':True,
|
|
50
|
+
'facet.field':'facet_test',
|
|
51
|
+
})
|
|
52
|
+
>>> res.get_results_count()
|
|
53
|
+
4
|
|
54
|
+
>>> res.get_facets()
|
|
55
|
+
{'facet_test': {'ipsum': 0, 'sit': 0, 'dolor': 2, 'amet,': 1, 'Lorem': 1}}
|
|
56
|
+
>>> res.get_facet_keys_as_list('facet_test')
|
|
57
|
+
['ipsum', 'sit', 'dolor', 'amet,', 'Lorem']
|
|
58
|
+
>>> res.docs
|
|
59
|
+
[{'product_name_exact': 'orci. Morbi ipsum
|
|
60
|
+
..... all the docs ....
|
|
61
|
+
'consectetur Mauris dolor Lorem adipiscing'}]
|
|
62
|
+
|
|
63
|
+
See, easy.... you just need to know the Solr query syntax.
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
Roadmap
|
|
67
|
+
----------
|
|
68
|
+
* Better test coverage
|
|
69
|
+
* Solr Streaming
|
|
70
|
+
|
|
71
|
+
Contributing
|
|
72
|
+
----------
|
|
73
|
+
I've realized that there isn't really a well maintained Solr Python library I liked, so I put this together. Contributions (code, tests, documentation) are definitely welcome; if you have a question about development please open up an issue on the github page. If you have a pull request, please make sure to add tests and that all of them pass before submitting. See the tests README for testing resources.
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
Documentation:
|
|
77
|
+
http://solrclient.readthedocs.org/en/latest/
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from .solrclient import SolrClient
|
|
2
|
+
from .solrresp import SolrResponse
|
|
3
|
+
from .schema import Schema
|
|
4
|
+
from .indexq import IndexQ
|
|
5
|
+
from .helpers import Reindexer
|
|
6
|
+
from .collections import Collections
|
|
7
|
+
from .zk import ZK
|
|
8
|
+
#This is the main project version. On new releases, it only needs to be updated here and in the README.md.
|
|
9
|
+
#Documentation and setup.py will pull from here.
|
|
10
|
+
__version__ = '0.4.0'
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from .transport import TransportRequests
|
|
4
|
+
from .schema import Schema
|
|
5
|
+
from .exceptions import *
|
|
6
|
+
from .solrresp import SolrResponse
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Collections():
    """
    Provides an interface to the Solr Collections API.
    https://cwiki.apache.org/confluence/display/solr/Collections+API
    """

    def __init__(self, solr, log):
        # solr is a SolrClient instance; its transport is used for API calls.
        self.solr = solr
        self.logger = log
        # Cache of per-node SolrClient instances keyed by base_url, used by
        # _get_collection_counts to query individual cores.
        self.solr_clients = {}

    def api(self, action, args=None):
        """
        Sends a request to Solr Collections API.
        Documentation is here: https://cwiki.apache.org/confluence/display/solr/Collections+API

        :param string action: Name of the Collections API action (e.g. LIST, CREATE).
        :param dict args: Dictionary of specific parameters for the action.
        :returns: Tuple of (parsed response, connection info).
        :raises SolrError: If Solr reports a non-zero status in the response header.
        """
        if args is None:
            args = {}
        args['action'] = action.upper()

        try:
            res, con_info = self.solr.transport.send_request(endpoint='admin/collections', params=args)
        except Exception as e:
            self.logger.error("Error querying SolrCloud Collections API. ")
            self.logger.exception(e)
            raise e

        if 'responseHeader' in res and res['responseHeader']['status'] == 0:
            return res, con_info
        else:
            # Bug fix: the original format string had one placeholder but was
            # given two arguments, so the Solr response was silently dropped
            # from the error message.
            raise SolrError("Error Issuing Collections API Call for: {} - {}".format(con_info, res))

    def clusterstatus(self):
        """
        Returns a slightly slimmed down version of the clusterstatus api command. It also gets count of documents in each shard on each replica and returns
        it as doc_count key for each replica.
        """
        res = self.cluster_status_raw()

        cluster = res['cluster']['collections']
        out = {}
        try:
            for collection in cluster:
                out[collection] = {}
                for shard in cluster[collection]['shards']:
                    out[collection][shard] = {}
                    for replica in cluster[collection]['shards'][shard]['replicas']:
                        out[collection][shard][replica] = cluster[collection]['shards'][shard]['replicas'][replica]
                        if out[collection][shard][replica]['state'] != 'active':
                            # Inactive replicas can't be queried for counts.
                            out[collection][shard][replica]['doc_count'] = False
                        else:
                            out[collection][shard][replica]['doc_count'] = self._get_collection_counts(
                                out[collection][shard][replica])
        except Exception as e:
            # Best-effort: log the parse failure and return whatever was
            # assembled so far rather than raising.
            self.logger.error("Couldn't parse response from clusterstatus API call")
            self.logger.exception(e)

        return out

    def cluster_status_raw(self, **kwargs):
        """
        Returns raw output of the clusterstatus api command.
        """
        res, con_info = self.api('clusterstatus', **kwargs)
        return res

    def exists(self, collection):
        """
        Returns True if a collection exists, False otherwise.
        """
        # Bug fix: the original returned True or (implicitly) None; return an
        # explicit boolean. Truthiness is unchanged for existing callers.
        return collection in self.list()

    def list(self):
        """
        Returns a list[string] of all collection names on the cluster.
        """
        res, info = self.api('LIST')
        return res['collections']

    def create(self, name, numShards, params=None):
        """
        Create a new collection.

        :param string name: Name of the new collection.
        :param int numShards: Number of shards to create it with.
        :param dict params: Additional parameters passed through to the CREATE action.
        """
        if params is None:
            params = {}
        params.update(
            name=name,
            numShards=numShards
        )
        return self.api('CREATE', params)

    def _get_collection_counts(self, core_data):
        """
        Queries each core to get individual counts for each core for each shard.
        Returns the count, or False if the core could not be queried.
        """
        base_url = core_data['base_url']
        if base_url not in self.solr_clients:
            from SolrClient import SolrClient
            # Bug fix: the original stored/read the client under the literal
            # string 'base_url' instead of the node's actual base_url, so all
            # nodes shared a client pointing at the first node seen.
            self.solr_clients[base_url] = SolrClient(base_url, log=self.logger)
        try:
            # distrib=false restricts the count to this core only.
            return self.solr_clients[base_url].query(core_data['core'],
                                                     {'q': '*:*',
                                                      'rows': 0,
                                                      'distrib': 'false',
                                                      }).get_num_found()
        except Exception as e:
            self.logger.error("Couldn't get Counts for {}/{}".format(core_data['base_url'], core_data['core']))
            self.logger.exception(e)
            return False

    def _for_core(self, cluster_resp=None):
        # Yields (collection, shard, core, core_data) for every core in the cluster.
        if cluster_resp is None:
            cluster_resp = self.clusterstatus()
        for collection in cluster_resp:
            for shard in cluster_resp[collection]:
                for core in cluster_resp[collection][shard]:
                    yield collection, shard, core, cluster_resp[collection][shard][core]

    def _for_shard(self, cluster_resp=None):
        # Yields (collection, shard, shard_data) for every shard in the cluster.
        if cluster_resp is None:
            cluster_resp = self.clusterstatus()
        for collection in cluster_resp:
            for shard in cluster_resp[collection]:
                yield collection, shard, cluster_resp[collection][shard]

    def check_status(self, ignore=(), status=None):
        """
        Checks status of each collection and shard to make sure that:
        a) Cluster state is active
        b) Number of docs matches across replicas for a given shard.
        Returns a dict of results for custom alerting.

        :param ignore: Iterable of check messages to skip.
        :param status: Pre-fetched clusterstatus() output; fetched fresh if None.
        """
        self.SHARD_CHECKS = [
            {'check_msg': 'Bad Core Count Check', 'f': self._check_shard_count},
            {'check_msg': 'Bad Shard Cluster Status', 'f': self._check_shard_status}
        ]
        if status is None:
            status = self.clusterstatus()
        out = {}
        for collection in status:
            out[collection] = {}
            out[collection]['coll_status'] = True  # Means it's fine
            out[collection]['coll_messages'] = []
            for shard in status[collection]:
                self.logger.debug("Checking {}/{}".format(collection, shard))
                s_dict = status[collection][shard]
                for check in self.SHARD_CHECKS:
                    if check['check_msg'] in ignore:
                        continue
                    res = check['f'](s_dict)
                    if not res:
                        out[collection]['coll_status'] = False
                        if check['check_msg'] not in out[collection]['coll_messages']:
                            out[collection]['coll_messages'].append(check['check_msg'])
                        self.logger.debug(s_dict)
        return out

    def _check_shard_count(self, cores_dict):
        # True when every replica of the shard reports the same doc_count.
        if len({cores_dict[core]['doc_count'] for core in cores_dict}) > 1:
            return False
        return True

    def _check_shard_status(self, cores_dict):
        # True when every replica of the shard is in the 'active' state.
        for core in cores_dict:
            if cores_dict[core]['state'] != 'active':
                return False
        return True
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
class SolrError(Exception):
    """
    Base class for any issues that Solr reports.
    """


class SolrResponseError(SolrError):
    """
    Errors related to parsing a Solr response.
    """


class ConnectionError(SolrError):
    """
    Errors connecting to Solr.

    NOTE: this intentionally keeps the historical name even though it shadows
    the builtin ConnectionError within this module.
    """


class ZookeeperError(SolrError):
    """
    Errors connecting to Zookeeper.
    """


class NotFoundError(SolrError):
    """
    Raised when a document wasn't found.
    """


class MinRfError(SolrError):
    """
    Raised when an index request didn't satisfy the required min_rf
    (the achieved replication factor was below the requested minimum).
    """
    # Achieved replication factor reported by Solr.
    rf = None
    # Minimum replication factor that was requested.
    min_rf = None

    def __init__(self, message, rf, min_rf, **kwargs):
        self.rf = rf
        self.min_rf = min_rf
        # Bug fix: the original called super().__init__(self, message, ...),
        # passing the exception instance as an extra positional argument and
        # polluting e.args with the instance itself.
        super().__init__(message, **kwargs)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .reindexer import Reindexer
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import logging
|
|
3
|
+
import sys
|
|
4
|
+
import gzip
|
|
5
|
+
import argparse
|
|
6
|
+
import os
|
|
7
|
+
import json
|
|
8
|
+
from datetime import datetime, timedelta
|
|
9
|
+
from time import time, sleep
|
|
10
|
+
from SolrClient import SolrClient, IndexQ
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Reindexer():
    '''
    Re-indexes documents from a source (a SolrClient collection or a directory
    of JSON files) into a destination (a SolrClient collection or an IndexQ).

    :param source: An instance of SolrClient, or a directory path (str) containing json files.
    :param dest: An instance of SolrClient or an instance of IndexQ.
    :param string source_coll: Source collection name; required if source is a SolrClient.
    :param string dest_coll: Destination collection name; only required if destination is SolrClient.
    :param int rows: Number of items to get in each query; default is 1000, however you will probably want to increase it.
    :param string date_field: String name of a Solr date field to use in sort and resume.
    :param bool devel: Whether to turn on super verbose logging for development. Standard DEBUG should suffice for most development.
    :param bool per_shard: Will add distrib=false to each query to get the data. Use this only if you will be running multiple instances of this to get the rest of the shards.
    :param list ignore_fields: What fields to exclude from Solr queries. This is important since if you pull them out, you won't be able to index the documents in.
        By default, it will try to determine and exclude copy fields as well as _version_. Pass in your own list to override or set it to False to prevent it from doing anything.
    '''
    def __init__(self,
                 source,
                 dest,
                 source_coll=None,
                 dest_coll=None,
                 rows=1000,
                 date_field=None,
                 devel=False,
                 per_shard=False,
                 ignore_fields=None,
                 ):

        self.log = logging.getLogger('reindexer')

        self._source = source
        self._source_coll = source_coll
        self._dest = dest
        self._dest_coll = dest_coll
        self._rows = rows
        self._date_field = date_field
        self._per_shard = per_shard
        self._items_processed = 0
        self._devel = devel
        # Bug fix: the original used a mutable default argument
        # (ignore_fields=['_version_']) which is shared across all Reindexer
        # instances and gets mutated via .extend() below. Build a fresh list
        # per instance instead; behavior for callers is unchanged.
        self._ignore_fields = ['_version_'] if ignore_fields is None else ignore_fields

        #Determine what source and destination should be
        if type(source) is SolrClient and source_coll:
            self._getter = self._from_solr
            #Maybe break this out later for the sake of testing
            if type(self._ignore_fields) is list and len(self._ignore_fields) == 1:
                self._ignore_fields.extend(self._get_copy_fields())

        elif type(source) is str and os.path.isdir(source):
            # NOTE(review): _from_json is not defined anywhere in this module,
            # so a directory source will fail here with AttributeError —
            # confirm whether the JSON getter was dropped upstream.
            self._getter = self._from_json
        else:
            raise ValueError("Incorrect Source Specified. Pass either a directory with json files or source SolrClient \
instance with the name of the collection.")

        if type(self._dest) is SolrClient and self._dest_coll:
            self._putter = self._to_solr
        elif type(dest) is IndexQ:
            self._putter = self._to_IndexQ
        else:
            raise ValueError("Incorrect Destination Specified. Pass either a directory with json files or destination SolrClient \
instance with the name of the collection.")
        self.log.info("Reindexer created succesfully. ")


    def _get_copy_fields(self):
        '''
        Returns the destination field names of all copy fields on the source
        collection, so they can be excluded from retrieved documents.
        '''
        if self._devel:
            self.log.debug("Getting additional copy fields to exclude")
            self.log.debug(self._source.schema.get_schema_copyfields(self._source_coll))
        fields = [field['dest'] for field in self._source.schema.get_schema_copyfields(self._source_coll)]
        self.log.info("Field exclusions are: {}".format(", ".join(fields)))
        return fields


    def reindex(self, fq=None, **kwargs):
        '''
        Starts the reindexing process. All keyword arguments will be passed down to the getter function.

        :param list fq: Filter queries to pass to source Solr to retrieve items. This can be used to limit the results.
        '''
        # Bug fix: avoid a mutable default argument; normalize to a fresh
        # list on every call.
        for items in self._getter(fq=fq if fq is not None else [], **kwargs):
            self._putter(items)
        if type(self._dest) is SolrClient and self._dest_coll:
            self.log.info("Finished Indexing, sending a commit")
            self._dest.commit(self._dest_coll, openSearcher=True)


    def _from_solr(self, fq=None, report_frequency=25):
        '''
        Generator that retrieves batches of documents from the source Solr
        collection using cursorMark deep paging.

        :param list fq: Additional filter queries to apply.
        :param int report_frequency: Log a progress line every N queries.
        '''
        if fq is None:
            # Bug fix: mutable default argument replaced with None sentinel.
            fq = []
        cursor = '*'
        stime = datetime.now()
        query_count = 0
        while True:
            #Get data with starting cursorMark
            query = self._get_query(cursor)
            #Add FQ to the query. This is used by resume to filter on date fields and when specifying document subset.
            #Not included in _get_query for more flexibility.

            if fq:
                if 'fq' in query:
                    query['fq'].extend(fq)
                else:
                    query['fq'] = fq

            results = self._source.query(self._source_coll, query)
            query_count += 1
            if query_count % report_frequency == 0:
                # Guard against ZeroDivisionError when less than a full
                # second has elapsed since the start.
                elapsed = max(int((datetime.now() - stime).seconds), 1)
                self.log.info("Processed {} Items in {} Seconds. Apprximately {} items/minute".format(
                    self._items_processed, elapsed,
                    str(int(self._items_processed / (elapsed / 60)))
                ))

            if results.get_results_count():
                #If we got items back, get the new cursor and yield the docs
                self._items_processed += results.get_results_count()
                cursor = results.get_cursor()
                #Remove ignore fields
                docs = self._trim_fields(results.docs)
                yield docs
                if results.get_results_count() < self._rows:
                    #Less results than asked, probably done
                    break
            else:
                #No Results, probably done :)
                self.log.debug("Got zero Results with cursor: {}".format(cursor))
                break


    def _trim_fields(self, docs):
        '''
        Removes ignore fields from the data that we got from Solr.
        '''
        # Robustness fix: ignore_fields=False is documented as "do nothing",
        # but the original would raise TypeError iterating over False.
        if not self._ignore_fields:
            return docs
        for doc in docs:
            for field in self._ignore_fields:
                if field in doc:
                    del doc[field]
        return docs


    def _get_query(self, cursor):
        '''
        Query template for source Solr; sorts by id by default, or by
        date_field (then id) when one was configured.
        '''
        query = {'q': '*:*',
                 'sort': 'id desc',
                 'rows': self._rows,
                 'cursorMark': cursor}
        if self._date_field:
            query['sort'] = "{} asc, id desc".format(self._date_field)
        if self._per_shard:
            query['distrib'] = 'false'
        return query


    def _to_IndexQ(self, data):
        '''
        Sends data to IndexQ instance.
        '''
        self._dest.add(data)


    def _to_solr(self, data):
        '''
        Sends data to a Solr instance.
        '''
        return self._dest.index_json(self._dest_coll, json.dumps(data, sort_keys=True))


    def _get_date_range_query(self, start_date, end_date, timespan='DAY', date_field=None):
        '''
        Builds a range-facet query for counting items per date range.

        :param start_date: Start of the facet range.
        :param end_date: End of the facet range.
        :param timespan: Solr Date Math compliant value for faceting ex HOUR, MONTH, DAY.
        :param date_field: Date field to facet on; defaults to the configured date_field.
        '''
        if date_field is None:
            date_field = self._date_field
        query = {'q': '*:*',
                 'rows': 0,
                 'facet': 'true',
                 'facet.range': date_field,
                 'facet.range.gap': '+1{}'.format(timespan),
                 'facet.range.end': '{}'.format(end_date),
                 'facet.range.start': '{}'.format(start_date),
                 'facet.range.include': 'all'
                 }
        if self._per_shard:
            query['distrib'] = 'false'
        return query


    def _get_edge_date(self, date_field, sort):
        '''
        Returns the earliest (sort='asc') or latest (sort='desc') value of
        date_field in the source collection.
        '''
        return self._source.query(self._source_coll, {
            'q': '*:*',
            'rows': 1,
            'fq': '+{}:*'.format(date_field),
            'sort': '{} {}'.format(date_field, sort)}).docs[0][date_field]


    def _get_date_facet_counts(self, timespan, date_field, start_date=None, end_date=None):
        '''
        Returns (source, destination) range-facet counts keyed by date range,
        used by resume() to compare the two collections.
        '''
        if 'DAY' not in timespan:
            raise ValueError("At this time, only DAY date range increment is supported. Aborting..... ")

        #Need to do this a bit better later. Don't like the string and date concatenations.
        if not start_date:
            start_date = self._get_edge_date(date_field, 'asc')
            start_date = datetime.strptime(start_date, '%Y-%m-%dT%H:%M:%S.%fZ').date().isoformat() + 'T00:00:00.000Z'
        else:
            start_date = start_date + 'T00:00:00.000Z'

        if not end_date:
            end_date = self._get_edge_date(date_field, 'desc')
            end_date = datetime.strptime(end_date, '%Y-%m-%dT%H:%M:%S.%fZ').date()
            # Pad by a day so the last partial day is included in the range.
            end_date += timedelta(days=1)
            end_date = end_date.isoformat() + 'T00:00:00.000Z'
        else:
            end_date = end_date + 'T00:00:00.000Z'

        self.log.info("Processing Items from {} to {}".format(start_date, end_date))

        #Get facet counts for source and destination collections
        source_facet = self._source.query(self._source_coll,
                                          self._get_date_range_query(timespan=timespan, start_date=start_date, end_date=end_date)
                                          ).get_facets_ranges()[date_field]
        dest_facet = self._dest.query(
            self._dest_coll, self._get_date_range_query(
                timespan=timespan, start_date=start_date, end_date=end_date
            )).get_facets_ranges()[date_field]
        return source_facet, dest_facet


    def resume(self, start_date=None, end_date=None, timespan='DAY', check=False):
        '''
        This method may help if the original run was interrupted for some reason. It will only work under the following conditions
        * You have a date field that you can facet on
        * Indexing was stopped for the duration of the copy

        The way this tries to resume re-indexing is by running a date range facet on the source and destination collections. It then compares
        the counts in both collections for each timespan specified. If the counts are different, it will re-index items for each range where
        the counts are off. You can also pass in a start_date to only get items after a certain time period. Note that each date range will be indexed in
        its entirety, even if there is only one item missing.

        Keep in mind this only checks the counts and not actual data. So make sure the indexes weren't modified between the reindexing execution and
        running the resume operation.

        :param start_date: Date to start indexing from. If not specified there will be no restrictions and all data will be processed. Note that
            this value will be passed to Solr directly and not modified.
        :param end_date: The date to index items up to.
        :param timespan: Solr Date Math compliant value for faceting; currently only DAY is supported.
        :param check: If set to True it will only log differences between the two collections without actually modifying the destination.
        '''
        if type(self._source) is not SolrClient or type(self._dest) is not SolrClient:
            raise ValueError("To resume, both source and destination need to be Solr.")

        source_facet, dest_facet = self._get_date_facet_counts(timespan, self._date_field, start_date=start_date, end_date=end_date)

        for dt_range in sorted(source_facet):
            if dt_range in dest_facet:
                self.log.info("Date Range: {} Source: {} Destination:{} Difference:{}".format(
                    dt_range, source_facet[dt_range], dest_facet[dt_range], (source_facet[dt_range] - dest_facet[dt_range])))
                if check:
                    continue
                if source_facet[dt_range] > dest_facet[dt_range]:
                    #Kicks off reindexing with an additional FQ
                    self.reindex(fq=['{}:[{} TO {}]'.format(self._date_field, dt_range, dt_range + '+1{}'.format(timespan))])
                    self.log.info("Complete Date Range {}".format(dt_range))
            else:
                self.log.error("Something went wrong; destinationSource: {}".format(source_facet))
                self.log.error("Destination: {}".format(dest_facet))
                raise ValueError("Date Ranges don't match up")
        self._dest.commit(self._dest_coll, openSearcher=True)
|