feed-ursus 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- feed_ursus-1.0.0/LICENSE +29 -0
- feed_ursus-1.0.0/PKG-INFO +134 -0
- feed_ursus-1.0.0/README.md +113 -0
- feed_ursus-1.0.0/feed_ursus/__init__.py +0 -0
- feed_ursus-1.0.0/feed_ursus/date_parser.py +60 -0
- feed_ursus-1.0.0/feed_ursus/feed_ursus.py +536 -0
- feed_ursus-1.0.0/feed_ursus/mapper/__init__.py +0 -0
- feed_ursus-1.0.0/feed_ursus/mapper/dlp.py +452 -0
- feed_ursus-1.0.0/feed_ursus/mapper/fields/iiif_text_direction.yml +9 -0
- feed_ursus-1.0.0/feed_ursus/mapper/fields/iiif_viewing_hint.yml +11 -0
- feed_ursus-1.0.0/feed_ursus/mapper/fields/language.yml +1121 -0
- feed_ursus-1.0.0/feed_ursus/mapper/fields/license.yml +46 -0
- feed_ursus-1.0.0/feed_ursus/mapper/fields/resource_type.yml +23 -0
- feed_ursus-1.0.0/feed_ursus/mapper/fields/rights_statement.yml +12 -0
- feed_ursus-1.0.0/feed_ursus/mapper/sinai.py +282 -0
- feed_ursus-1.0.0/feed_ursus/year_parser.py +57 -0
- feed_ursus-1.0.0/pyproject.toml +35 -0
feed_ursus-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2019, UCLA Library
|
|
4
|
+
All rights reserved.
|
|
5
|
+
|
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
|
8
|
+
|
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
10
|
+
list of conditions and the following disclaimer.
|
|
11
|
+
|
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
and/or other materials provided with the distribution.
|
|
15
|
+
|
|
16
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
17
|
+
contributors may be used to endorse or promote products derived from
|
|
18
|
+
this software without specific prior written permission.
|
|
19
|
+
|
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
21
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
22
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
23
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
24
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
25
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
26
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
27
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
28
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
29
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: feed-ursus
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Command line tool to feed Solr index for the UCLA Digital Library's frontend, Ursus (https://digital.library.ucla.edu/)
|
|
5
|
+
Author: Your Name
|
|
6
|
+
Author-email: you@example.com
|
|
7
|
+
Requires-Python: >=3.10,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Requires-Dist: click (>=8.1.3,<9.0.0)
|
|
13
|
+
Requires-Dist: pysolr (>=3.8,<4.0)
|
|
14
|
+
Requires-Dist: python-dateutil (>=2.8.2,<3.0.0)
|
|
15
|
+
Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
|
|
16
|
+
Requires-Dist: requests (>=2.32.2,<3.0.0)
|
|
17
|
+
Requires-Dist: rich (>=13.4.1,<14.0.0)
|
|
18
|
+
Requires-Dist: setuptools (>=74.0.0,<75.0.0)
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# feed_ursus
|
|
22
|
+
Script to process CSVs into an Ursus solr index.
|
|
23
|
+
|
|
24
|
+
## Using feed_ursus
|
|
25
|
+
|
|
26
|
+
For basic use, you can install feed_ursus as a systemwide command directly from github, without having to first clone the repository.
|
|
27
|
+
|
|
28
|
+
### Installation
|
|
29
|
+
|
|
30
|
+
We recommend installing with [pipx](https://pipx.pypa.io/). On MacOS, you can install pipx (and python!) with [homebrew](https://brew.sh):
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
brew install pipx pyenv
|
|
34
|
+
pipx ensurepath
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Then:
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
pipx install git+https://github.com/uclalibrary/feed_ursus.git
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Pipx will install feed_ursus in its own virtualenv, but makes the command accessible from anywhere so you don't need to activate the virtualenv yourself.
|
|
44
|
+
|
|
45
|
+
### Use
|
|
46
|
+
|
|
47
|
+
Convert a csv into a json document that follows the data model of an Ursus solr index:
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
feed_ursus [path/to/your.csv]
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
This repo includes a docker-compose.yml file that will run local instances of solr and ursus for use in testing this script. To use them, first install [docker](https://docs.docker.com/install/) and [docker compose](https://docs.docker.com/compose/install/). Then run:
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
docker-compose up --detach
|
|
57
|
+
docker-compose run web bundle exec rails db:setup
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
It might take a minute or so for solr to get up and running, at which point you should be able to see your new site at http://localhost:3000. Ursus will be empty, because you haven't loaded any data yet.
|
|
61
|
+
|
|
62
|
+
To load data from a csv:
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
feed_ursus --solr_url=http://localhost:8983/solr/californica --mapping=dlp load [path/to/your.csv]
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Mappers
|
|
69
|
+
|
|
70
|
+
Different metadata mappings are included for general Digital Library use (`--mapping=dlp`) and for the Sinai Manuscripts Digital Library (`--mapping=sinai`). Because this script was originally used for the Sinai Manuscripts project, the default value is `sinai` for backwards compatibility.
|
|
71
|
+
|
|
72
|
+
## Developing feed_ursus
|
|
73
|
+
|
|
74
|
+
### Installing
|
|
75
|
+
|
|
76
|
+
For development, clone the repository and use poetry to set up the virtualenv:
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
git clone git@github.com:UCLALibrary/feed_ursus.git
|
|
80
|
+
cd feed_ursus
|
|
81
|
+
pipx install poetry
|
|
82
|
+
poetry install
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Then, to activate the virtualenv:
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
poetry shell
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
The following will assume the virtualenv is active. You could also run e.g. `poetry run feed_ursus [path/to/your.csv]`
|
|
92
|
+
|
|
93
|
+
### Using the development version
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
feed_ursus --solr_url http://localhost:8983/solr/californica load [path/to/your.csv]
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Running the tests
|
|
100
|
+
|
|
101
|
+
Tests are written for [pytest](https://docs.pytest.org/en/latest/):
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
pytest
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Running the formatter and linters:
|
|
108
|
+
|
|
109
|
+
black (formatter) will run in check mode in ci, so make sure you run it before committing:
|
|
110
|
+
```
|
|
111
|
+
black .
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
flake8 (linter) isn't currently running in ci, but should be put back in soon:
|
|
115
|
+
```
|
|
116
|
+
flake8
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
pylint (linter) isn't currently running in ci, but should be put back in soon:
|
|
120
|
+
```
|
|
121
|
+
pylint
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
mypy (static type checker) isn't currently running in ci, but should be put back in soon:
|
|
125
|
+
```
|
|
126
|
+
mypy
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
# Caveats
|
|
130
|
+
|
|
131
|
+
## IIIF Manifests
|
|
132
|
+
|
|
133
|
+
When importing a work, the script will always assume that a IIIF manifest exists at https://iiif.library.ucla.edu/[ark]/manifest, where [ark] is the URL-encoded Archival Resource Key of the work. This link should work, as long as a manifest has been pushed to that location by importing the work into [Fester](https://github.com/UCLALibrary/fester) or [Californica](https://github.com/UCLALibrary/californica). If you haven't done one of those, obviously, the link will fail and the image won't be visible, but metadata will import and be visible. A manifest can then be created and pushed to the expected location without re-running feed_ursus.py.
|
|
134
|
+
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# feed_ursus
|
|
2
|
+
Script to process CSVs into an Ursus solr index.
|
|
3
|
+
|
|
4
|
+
## Using feed_ursus
|
|
5
|
+
|
|
6
|
+
For basic use, you can install feed_ursus as a systemwide command directly from github, without having to first clone the repository.
|
|
7
|
+
|
|
8
|
+
### Installation
|
|
9
|
+
|
|
10
|
+
We recommend installing with [pipx](https://pipx.pypa.io/). On MacOS, you can install pipx (and python!) with [homebrew](https://brew.sh):
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
brew install pipx pyenv
|
|
14
|
+
pipx ensurepath
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Then:
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
pipx install git+https://github.com/uclalibrary/feed_ursus.git
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Pipx will install feed_ursus in its own virtualenv, but makes the command accessible from anywhere so you don't need to activate the virtualenv yourself.
|
|
24
|
+
|
|
25
|
+
### Use
|
|
26
|
+
|
|
27
|
+
Convert a csv into a json document that follows the data model of an Ursus solr index:
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
feed_ursus [path/to/your.csv]
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
This repo includes a docker-compose.yml file that will run local instances of solr and ursus for use in testing this script. To use them, first install [docker](https://docs.docker.com/install/) and [docker compose](https://docs.docker.com/compose/install/). Then run:
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
docker-compose up --detach
|
|
37
|
+
docker-compose run web bundle exec rails db:setup
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
It might take a minute or so for solr to get up and running, at which point you should be able to see your new site at http://localhost:3000. Ursus will be empty, because you haven't loaded any data yet.
|
|
41
|
+
|
|
42
|
+
To load data from a csv:
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
feed_ursus --solr_url=http://localhost:8983/solr/californica --mapping=dlp load [path/to/your.csv]
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Mappers
|
|
49
|
+
|
|
50
|
+
Different metadata mappings are included for general Digital Library use (`--mapping=dlp`) and for the Sinai Manuscripts Digital Library (`--mapping=sinai`). Because this script was originally used for the Sinai Manuscripts project, the default value is `sinai` for backwards compatibility.
|
|
51
|
+
|
|
52
|
+
## Developing feed_ursus
|
|
53
|
+
|
|
54
|
+
### Installing
|
|
55
|
+
|
|
56
|
+
For development, clone the repository and use poetry to set up the virtualenv:
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
git clone git@github.com:UCLALibrary/feed_ursus.git
|
|
60
|
+
cd feed_ursus
|
|
61
|
+
pipx install poetry
|
|
62
|
+
poetry install
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Then, to activate the virtualenv:
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
poetry shell
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
The following will assume the virtualenv is active. You could also run e.g. `poetry run feed_ursus [path/to/your.csv]`
|
|
72
|
+
|
|
73
|
+
### Using the development version
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
feed_ursus --solr_url http://localhost:8983/solr/californica load [path/to/your.csv]
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Running the tests
|
|
80
|
+
|
|
81
|
+
Tests are written for [pytest](https://docs.pytest.org/en/latest/):
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
pytest
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Running the formatter and linters:
|
|
88
|
+
|
|
89
|
+
black (formatter) will run in check mode in ci, so make sure you run it before committing:
|
|
90
|
+
```
|
|
91
|
+
black .
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
flake8 (linter) isn't currently running in ci, but should be put back in soon:
|
|
95
|
+
```
|
|
96
|
+
flake8
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
pylint (linter) isn't currently running in ci, but should be put back in soon:
|
|
100
|
+
```
|
|
101
|
+
pylint
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
mypy (static type checker) isn't currently running in ci, but should be put back in soon:
|
|
105
|
+
```
|
|
106
|
+
mypy
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
# Caveats
|
|
110
|
+
|
|
111
|
+
## IIIF Manifests
|
|
112
|
+
|
|
113
|
+
When importing a work, the script will always assume that a IIIF manifest exists at https://iiif.library.ucla.edu/[ark]/manifest, where [ark] is the URL-encoded Archival Resource Key of the work. This link should work, as long as a manifest has been pushed to that location by importing the work into [Fester](https://github.com/UCLALibrary/fester) or [Californica](https://github.com/UCLALibrary/californica). If you haven't done one of those, obviously, the link will fail and the image won't be visible, but metadata will import and be visible. A manifest can then be created and pushed to the expected location without re-running feed_ursus.py.
|
|
File without changes
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Creates a multi-valued 'year_isim' field by parsing input strings.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import typing
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from dateutil import parser
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
NORMALIZED_RANGE = re.compile(r"(.*)/(.*)")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_dates(normalized_dates: typing.Any):
    """Convert 'normalized_date' strings into a sorted list of datetimes.

    Each entry is either a single date or a 'start/end' range; for a range,
    both endpoints are parsed and kept only when both parse successfully.
    Non-string entries and unparseable dates are silently skipped.

    Args:
        normalized_dates: An iterable of strings in 'normalized_date'
            format. A non-iterable input yields an empty list.

    Returns:
        A sorted list of datetime objects parsed from the input.
    """
    if not isinstance(normalized_dates, typing.Iterable):
        return []

    parsed_dates: set = set()
    for raw_value in normalized_dates:
        if not isinstance(raw_value, str):
            continue
        range_match = NORMALIZED_RANGE.search(raw_value)
        if range_match is None:
            single = get_date(raw_value)
            if single:
                parsed_dates.add(single)
        else:
            endpoints = [get_date(part) for part in range_match.groups()]
            # Only keep the range if both endpoints parsed successfully.
            if all(endpoints):
                parsed_dates.update(endpoints)
    return sorted(parsed_dates)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_date(date: str):
    """Parse a 'normalized_date' string into a datetime.

    Components missing from the input string are filled in from the
    default of 1978-01-01 (presumably an arbitrary sentinel — confirm
    against downstream consumers of these dates).

    Args:
        date: a string containing a date in 'normalized_date' format.

    Returns:
        A datetime.datetime on success, or None if the string cannot be
        parsed as a date.
    """
    try:
        return parser.parse(date, default=datetime(1978, 1, 1))
    except ValueError as err:
        # Best-effort: report the unparseable value and skip it rather
        # than aborting the whole feed.
        print(err)
        return None
|