followthemoney 1.3.7__py3-none-any.whl → 3.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +5 -3
- followthemoney/cli/__init__.py +17 -0
- followthemoney/cli/aggregate.py +56 -0
- followthemoney/cli/cli.py +88 -0
- followthemoney/cli/exports.py +121 -0
- followthemoney/cli/mapping.py +85 -0
- followthemoney/cli/sieve.py +67 -0
- followthemoney/cli/util.py +142 -0
- followthemoney/compare.py +130 -60
- followthemoney/exc.py +19 -6
- followthemoney/export/common.py +29 -0
- followthemoney/export/csv.py +82 -0
- followthemoney/export/excel.py +75 -0
- followthemoney/export/graph.py +79 -0
- followthemoney/export/neo4j.py +182 -0
- followthemoney/export/rdf.py +26 -0
- followthemoney/graph.py +308 -0
- followthemoney/helpers.py +212 -0
- followthemoney/mapping/__init__.py +1 -1
- followthemoney/mapping/csv.py +67 -35
- followthemoney/mapping/entity.py +116 -44
- followthemoney/mapping/property.py +90 -44
- followthemoney/mapping/query.py +27 -19
- followthemoney/mapping/source.py +15 -5
- followthemoney/mapping/sql.py +75 -61
- followthemoney/messages.py +13 -7
- followthemoney/model.py +108 -56
- followthemoney/namespace.py +119 -0
- followthemoney/offshore.py +48 -0
- followthemoney/ontology.py +77 -0
- followthemoney/property.py +204 -71
- followthemoney/proxy.py +455 -118
- followthemoney/rdf.py +9 -0
- followthemoney/schema/Address.yaml +78 -0
- followthemoney/schema/Airplane.yaml +17 -10
- followthemoney/schema/Analyzable.yaml +54 -0
- followthemoney/schema/Article.yaml +16 -0
- followthemoney/schema/Assessment.yaml +32 -0
- followthemoney/schema/Asset.yaml +10 -4
- followthemoney/schema/Associate.yaml +41 -0
- followthemoney/schema/Audio.yaml +24 -0
- followthemoney/schema/BankAccount.yaml +53 -9
- followthemoney/schema/Call.yaml +48 -0
- followthemoney/schema/CallForTenders.yaml +117 -0
- followthemoney/schema/Company.yaml +37 -12
- followthemoney/schema/Contract.yaml +41 -7
- followthemoney/schema/ContractAward.yaml +30 -11
- followthemoney/schema/CourtCase.yaml +16 -10
- followthemoney/schema/CourtCaseParty.yaml +17 -6
- followthemoney/schema/CryptoWallet.yaml +48 -0
- followthemoney/schema/Debt.yaml +37 -0
- followthemoney/schema/Directorship.yaml +17 -4
- followthemoney/schema/Document.yaml +72 -139
- followthemoney/schema/Documentation.yml +38 -0
- followthemoney/schema/EconomicActivity.yaml +32 -17
- followthemoney/schema/Email.yaml +76 -0
- followthemoney/schema/Employment.yaml +39 -0
- followthemoney/schema/Event.yaml +35 -3
- followthemoney/schema/Family.yaml +41 -0
- followthemoney/schema/Folder.yaml +13 -0
- followthemoney/schema/HyperText.yaml +21 -0
- followthemoney/schema/Identification.yaml +40 -0
- followthemoney/schema/Image.yaml +25 -0
- followthemoney/schema/Interest.yaml +3 -6
- followthemoney/schema/Interval.yaml +56 -5
- followthemoney/schema/LegalEntity.yaml +81 -20
- followthemoney/schema/License.yaml +7 -3
- followthemoney/schema/Membership.yaml +19 -4
- followthemoney/schema/Mention.yaml +54 -0
- followthemoney/schema/Message.yaml +73 -0
- followthemoney/schema/Note.yaml +23 -0
- followthemoney/schema/Occupancy.yaml +40 -0
- followthemoney/schema/Organization.yaml +38 -3
- followthemoney/schema/Ownership.yaml +16 -4
- followthemoney/schema/Package.yaml +17 -0
- followthemoney/schema/Page.yaml +43 -0
- followthemoney/schema/Pages.yaml +23 -0
- followthemoney/schema/Passport.yaml +15 -17
- followthemoney/schema/Payment.yaml +38 -7
- followthemoney/schema/Person.yaml +61 -5
- followthemoney/schema/PlainText.yaml +17 -0
- followthemoney/schema/Position.yaml +50 -0
- followthemoney/schema/Post.yaml +42 -0
- followthemoney/schema/Project.yaml +27 -0
- followthemoney/schema/ProjectParticipant.yaml +36 -0
- followthemoney/schema/PublicBody.yaml +14 -3
- followthemoney/schema/RealEstate.yaml +19 -3
- followthemoney/schema/Representation.yaml +17 -6
- followthemoney/schema/Sanction.yaml +44 -20
- followthemoney/schema/Security.yaml +59 -0
- followthemoney/schema/Similar.yaml +37 -0
- followthemoney/schema/Succession.yaml +36 -0
- followthemoney/schema/Table.yaml +32 -0
- followthemoney/schema/TaxRoll.yaml +27 -9
- followthemoney/schema/Thing.yaml +69 -13
- followthemoney/schema/Trip.yaml +42 -0
- followthemoney/schema/UnknownLink.yaml +17 -6
- followthemoney/schema/UserAccount.yaml +44 -0
- followthemoney/schema/Value.yaml +5 -1
- followthemoney/schema/Vehicle.yaml +25 -8
- followthemoney/schema/Vessel.yaml +18 -10
- followthemoney/schema/Video.yaml +20 -0
- followthemoney/schema/Workbook.yaml +18 -0
- followthemoney/schema.py +406 -135
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
- followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
- followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
- followthemoney/translations/fr/followthemoney.po +3861 -0
- followthemoney/translations/messages.pot +3021 -725
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
- followthemoney/translations/ru/followthemoney.po +4221 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
- followthemoney/types/__init__.py +35 -17
- followthemoney/types/address.py +41 -21
- followthemoney/types/checksum.py +25 -0
- followthemoney/types/common.py +233 -88
- followthemoney/types/country.py +89 -56
- followthemoney/types/date.py +59 -76
- followthemoney/types/email.py +66 -35
- followthemoney/types/entity.py +66 -13
- followthemoney/types/gender.py +66 -0
- followthemoney/types/iban.py +47 -28
- followthemoney/types/identifier.py +49 -22
- followthemoney/types/ip.py +35 -21
- followthemoney/types/json.py +58 -0
- followthemoney/types/language.py +124 -37
- followthemoney/types/mimetype.py +44 -0
- followthemoney/types/name.py +56 -12
- followthemoney/types/number.py +30 -0
- followthemoney/types/phone.py +92 -34
- followthemoney/types/registry.py +52 -0
- followthemoney/types/string.py +43 -0
- followthemoney/types/topic.py +94 -0
- followthemoney/types/url.py +39 -17
- followthemoney/util.py +139 -45
- followthemoney-3.8.0.dist-info/METADATA +153 -0
- followthemoney-3.8.0.dist-info/RECORD +157 -0
- {followthemoney-1.3.7.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
- followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
- followthemoney-1.3.7.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
- followthemoney/link.py +0 -75
- followthemoney/schema/Associate.yml +0 -19
- followthemoney/schema/Family.yml +0 -19
- followthemoney/schema/Land.yml +0 -9
- followthemoney/schema/Relationship.yaml +0 -26
- followthemoney/types/domain.py +0 -50
- followthemoney-1.3.7.dist-info/DESCRIPTION.rst +0 -3
- followthemoney-1.3.7.dist-info/METADATA +0 -39
- followthemoney-1.3.7.dist-info/RECORD +0 -108
- followthemoney-1.3.7.dist-info/entry_points.txt +0 -3
- followthemoney-1.3.7.dist-info/metadata.json +0 -1
- followthemoney-1.3.7.dist-info/namespace_packages.txt +0 -1
- followthemoney-1.3.7.dist-info/top_level.txt +0 -3
- ns/ontology.py +0 -128
- tests/types/test_addresses.py +0 -24
- tests/types/test_common.py +0 -32
- tests/types/test_countries.py +0 -27
- tests/types/test_dates.py +0 -73
- tests/types/test_domains.py +0 -23
- tests/types/test_emails.py +0 -32
- tests/types/test_entity.py +0 -19
- tests/types/test_iban.py +0 -109
- tests/types/test_identifiers.py +0 -27
- tests/types/test_ip.py +0 -29
- tests/types/test_languages.py +0 -23
- tests/types/test_names.py +0 -33
- tests/types/test_phones.py +0 -24
- tests/types/test_registry.py +0 -14
- tests/types/test_urls.py +0 -23
- {ns → followthemoney/export}/__init__.py +0 -0
- /tests/types/__init__.py → /followthemoney/py.typed +0 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# This module violates the boundary between the role of code and
|
|
2
|
+
# YAML in the rest of followthemoney. It handles normalisations
|
|
3
|
+
# which would be much harder to express in abstract, especially
|
|
4
|
+
# those that simplify the data based on their pragmatics.
|
|
5
|
+
#
|
|
6
|
+
# If anyone were to swap out the default model, this would
|
|
7
|
+
# probably be the first place to break.
|
|
8
|
+
from os.path import splitext
|
|
9
|
+
from typing import Iterable, List, Optional, Set
|
|
10
|
+
from normality import safe_filename
|
|
11
|
+
from mimetypes import guess_extension
|
|
12
|
+
from itertools import product
|
|
13
|
+
from datetime import datetime, timedelta
|
|
14
|
+
|
|
15
|
+
from followthemoney.types import registry
|
|
16
|
+
from followthemoney.proxy import E
|
|
17
|
+
from followthemoney.util import join_text
|
|
18
|
+
|
|
19
|
+
PROV_MIN_DATES = ("createdAt", "authoredAt", "publishedAt")
|
|
20
|
+
PROV_MAX_DATES = ("modifiedAt", "retrievedAt")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def remove_checksums(proxy: E) -> E:
    """Strip every checksum-typed property from an entity.

    Meant for entities accepted as untrusted input, e.g. via a web API.
    Letting a user submit checksum values would constitute a security risk:
    a checksum can be traded in for access to the matching file if it exists
    in the underlying content-addressed storage. The safest option is to
    drop all of them.

    The proxy is modified in place and also returned.
    """
    for prop in proxy.iterprops():
        if prop.type != registry.checksum:
            continue
        proxy.pop(prop)
    return proxy
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def simplify_provenance(proxy: E) -> E:
    """Reduce each provenance date property to its single most meaningful value.

    Properties named in ``PROV_MAX_DATES`` keep only the largest of their
    values, those named in ``PROV_MIN_DATES`` only the smallest. Properties
    without any value are left unset.

    The proxy is modified in place and also returned.
    """
    # The "max" group is processed before the "min" group.
    for prop_names, choose in ((PROV_MAX_DATES, max), (PROV_MIN_DATES, min)):
        for prop_name in prop_names:
            candidates = proxy.pop(prop_name, quiet=True)
            if not len(candidates):
                continue
            proxy.set(prop_name, choose(candidates), cleaned=True)
    return proxy
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def entity_filename(
    proxy: E, base_name: Optional[str] = None, extension: Optional[str] = None
) -> Optional[str]:
    """Derive a safe filename for the given entity.

    Explicitly passed ``base_name`` / ``extension`` values always win. For
    entities whose schema is a ``Document``, missing parts are filled in from
    the entity itself, in this order:

    1. the first ``extension`` property value,
    2. the stem and suffix of the ``fileName`` values,
    3. an extension guessed from the ``mimeType`` values.

    If no base name was found, the entity ID is used instead. The result is
    whatever ``safe_filename`` produces for that base name and extension.
    """
    if proxy.schema.is_a("Document"):
        for candidate in proxy.get("extension", quiet=True):
            if extension is None:
                extension = candidate
                continue
            break

        for file_name in proxy.get("fileName", quiet=True):
            stem, suffix = splitext(file_name)
            if base_name is None and len(stem):
                base_name = stem
            if extension is None and len(suffix):
                extension = suffix

        for mime_type in proxy.get("mimeType", quiet=True):
            if extension is None:
                # guess_extension() may return None, in which case the next
                # MIME type is tried.
                extension = guess_extension(mime_type)
                continue
            break

    return safe_filename(base_name or proxy.id, extension=extension)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def name_entity(entity: E) -> E:
    """Keep one ``name`` on an entity and demote the others to ``alias``.

    Applies only to entities whose schema is a ``Thing`` and which carry more
    than one name. The name chosen by ``registry.name.pick`` becomes the sole
    ``name`` value; the remaining names are added as aliases. This is awkward,
    since names are not special and may not always be the caption.

    The entity is modified in place and also returned.
    """
    if not entity.schema.is_a("Thing"):
        return entity

    names = entity.get("name")
    if len(names) <= 1:
        return entity

    primary = registry.name.pick(names)
    if primary in names:
        names.remove(primary)
    entity.set("name", primary)
    entity.add("alias", names)
    return entity
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def check_person_cutoff(
    entity: E,
    death_cutoff: datetime = datetime(2000, 1, 1),
    birth_cutoff: Optional[datetime] = None,
) -> bool:
    """Check if a person has been dead long enough to not be relevant for
    investigations any more.

    Args:
        entity: The entity to check. Anything that is not a ``Person`` is
            never cut off.
        death_cutoff: A person whose latest ``deathDate`` lies before this
            moment is cut off.
        birth_cutoff: A person whose earliest ``birthDate`` lies before this
            moment is cut off. Defaults to ``death_cutoff`` minus
            ``100 * 365`` days (roughly a century; leap days are ignored).

    Returns:
        ``True`` if the person falls before one of the cutoffs, else ``False``.

    Date values are compared as strings. A value may be less precise than the
    cutoff (e.g. ``"2000"``), so the cutoff is truncated to the value's length
    first. Otherwise a year-only ``"2000"`` would sort before
    ``"2000-01-01T00:00:00"`` and be treated as earlier than the cutoff, even
    though the date may well lie after it.
    """
    if not entity.schema.is_a("Person"):
        return False

    def _is_before(value: str, cutoff: datetime) -> bool:
        # Compare at the precision of the value, not of the cutoff.
        return value < cutoff.isoformat()[: len(value)]

    death_dates = entity.get("deathDate", quiet=True)
    if len(death_dates) and _is_before(max(death_dates), death_cutoff):
        return True

    birth_dates = entity.get("birthDate", quiet=True)
    if birth_cutoff is None:
        birth_cutoff = death_cutoff - timedelta(days=100 * 365)
    if len(birth_dates) and _is_before(min(birth_dates), birth_cutoff):
        return True
    return False
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def remove_prefix_dates(entity: E) -> E:
    """Drop date values that are mere prefixes of more precise ones.

    For every date-typed property of the entity, values that are a prefix of
    another value of the same property are removed. For example, if a Person
    has both a birthDate of 1990 and of 1990-05-01, only 1990-05-01 is kept.

    The entity is modified in place and also returned.
    """
    for prop in entity.iterprops():
        if prop.type != registry.date:
            continue
        distinct = remove_prefix_date_values(entity.get(prop))
        entity.set(prop, distinct)
    return entity
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def remove_prefix_date_values(values: Iterable[str]) -> List[str]:
    """Return ``values`` without those that are a prefix of another value.

    See ``remove_prefix_dates``. The result is ordered longest value first.
    Exact duplicates collapse into one entry, because a string is a prefix of
    itself.
    """
    # Longest first, so everything that could contain a value precedes it.
    ordered = sorted(values, key=len, reverse=True)
    return [
        candidate
        for position, candidate in enumerate(ordered)
        if not any(other.startswith(candidate) for other in ordered[:position])
    ]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def inline_names(entity: E, related: E) -> None:
    """Copy the names of ``related`` into ``entity``'s ``namesMentioned``.

    Partial work-around for a UI problem. Imagine a list of payments between
    a sender and a beneficiary. A user searching for a term from either
    party's name gets no result, because the name is indexed only with the
    parties, not with the payment. Inlining the names makes the payment
    findable.

    Bad in theory, useful in practice. Nothing happens if the schema of
    ``entity`` has no ``namesMentioned`` property.
    """
    names_mentioned = entity.schema.get("namesMentioned")
    if names_mentioned is None:
        return
    entity.add(names_mentioned, related.get_type_values(registry.name))
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def combine_names(entity: E) -> E:
    """Build full-name aliases for a Person from its individual name parts.

    Every combination of first, second, middle, father and last name is
    joined and added as an ``alias``; second, middle and father names are
    optional in each combination. This cannot be culturally correct for the
    whole planet at once, so it should mostly be used for internal-facing
    (e.g. matching) processes.

    The entity is modified in place and also returned. Non-Person entities
    are returned untouched.
    """
    if not entity.schema.is_a("Person"):
        return entity

    first_names = entity.get("firstName")
    optional_parts = []
    for prop_name in ("secondName", "middleName", "fatherName"):
        parts = entity.get(prop_name)
        # The empty string lets a combination leave this part out.
        parts.append("")
        optional_parts.append(parts)
    last_names = entity.get("lastName")

    for combination in product(first_names, *optional_parts, last_names):
        full_name = join_text(*combination)
        if full_name is not None:
            entity.add("alias", full_name)

    # If the entity still has no name-typed value at all, at least add the
    # last names:
    if not entity.get_type_values(registry.name) and len(last_names):
        entity.add("alias", last_names)
    return entity
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def dates_years(dates: Iterable[Optional[str]]) -> Set[str]:
    """Return the distinct years found in a collection of date strings.

    The year is taken to be the first four characters of each value;
    ``None`` entries are skipped.
    """
    return {date[:4] for date in dates if date is not None}
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def post_summary(
    organization: str,
    role: Optional[str],
    start_dates: Iterable[Optional[str]],
    end_dates: Iterable[Optional[str]],
    dates: Iterable[Optional[str]],
) -> str:
    """Make a string summary for a Post object.

    The summary is the organization, optionally followed by ``(role, dates)``
    in brackets. The date part is ``<start year>-<end year>`` when a start or
    end year is known (a missing side is left blank). Otherwise it is the
    sorted, comma-separated years taken from ``dates``. Empty parts are left
    out, and the brackets are dropped if nothing is left to put in them.
    """
    # NOTE(review): the *earliest* end year is used, mirroring the start
    # year; confirm that the latest one is not what was intended.
    start = min(dates_years(start_dates), default="")
    end = min(dates_years(end_dates), default="")
    years = dates_years(dates)

    if len(start) or len(end):
        date_range: Optional[str] = f"{start}-{end}"
    elif len(years):
        date_range = ", ".join(sorted(years))
    else:
        date_range = None

    details = [part for part in (role, date_range) if part]
    if not details:
        return organization
    bracketed = ", ".join(details)
    return f"{organization} ({bracketed})"
|
followthemoney/mapping/csv.py
CHANGED
|
@@ -1,65 +1,97 @@
|
|
|
1
1
|
import io
|
|
2
2
|
import os
|
|
3
3
|
import logging
|
|
4
|
+
from banal.lists import ensure_list
|
|
4
5
|
import requests
|
|
5
6
|
from csv import DictReader
|
|
6
|
-
from
|
|
7
|
-
from
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
from banal import keys_values
|
|
9
|
+
from typing import (
|
|
10
|
+
TYPE_CHECKING,
|
|
11
|
+
Any,
|
|
12
|
+
Dict,
|
|
13
|
+
Generator,
|
|
14
|
+
ItemsView,
|
|
15
|
+
Iterable,
|
|
16
|
+
List,
|
|
17
|
+
Optional,
|
|
18
|
+
Set,
|
|
19
|
+
Tuple,
|
|
20
|
+
cast,
|
|
21
|
+
)
|
|
8
22
|
|
|
9
|
-
from followthemoney.mapping.source import Source
|
|
23
|
+
from followthemoney.mapping.source import Record, Source
|
|
24
|
+
from followthemoney.util import sanitize_text
|
|
10
25
|
from followthemoney.exc import InvalidMapping
|
|
11
26
|
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from followthemoney.mapping.query import QueryMapping
|
|
29
|
+
|
|
12
30
|
log = logging.getLogger(__name__)
|
|
31
|
+
FilterList = List[Tuple[str, Set[Optional[str]]]]
|
|
13
32
|
|
|
14
33
|
|
|
15
34
|
class CSVSource(Source):
    """Special case for entity loading directly from a CSV URL"""

    def __init__(self, query: "QueryMapping", data: Dict[str, Any]) -> None:
        """Collect the CSV locations and pre-compute the filter value sets.

        Locations are read from the ``csv_url`` / ``csv_urls`` keys of
        ``data``; environment variables in them are expanded.

        Raises:
            InvalidMapping: if no CSV location is configured.
        """
        super().__init__(query, data)
        self.urls: Set[str] = set()
        for url in keys_values(data, "csv_url", "csv_urls"):
            self.urls.add(cast(str, os.path.expandvars(url)))

        if not len(self.urls):
            raise InvalidMapping("No CSV URLs are specified.")

        self.filters_set = self._parse_filters(self.filters)
        self.filters_not_set = self._parse_filters(self.filters_not)

    def _parse_filters(self, filters: ItemsView[str, Any]) -> FilterList:
        """Turn ``(column, value-or-values)`` pairs into ``(column, set)`` pairs.

        Sets make the per-record membership tests in ``check_filters`` cheap.
        """
        filters_set: FilterList = []
        for (key, value) in filters:
            values = set(cast(List[Optional[str]], ensure_list(value)))
            filters_set.append((key, values))
        return filters_set

    def check_filters(self, data: Record) -> bool:
        """Return whether a record passes all positive and negative filters.

        A record passes when every ``filters`` column holds one of its allowed
        values and no ``filters_not`` column holds one of its excluded values.
        """
        for (k, v) in self.filters_set:
            if data.get(k) not in v:
                return False
        for (k, v) in self.filters_not_set:
            if data.get(k) in v:
                return False
        return True

    @classmethod
    def read_csv(cls, fh: Iterable[str]) -> Generator[Record, None, None]:
        """Yield one record per CSV row read from an iterable of lines.

        Each cell is passed through ``sanitize_text``; cells for which it
        returns ``None`` are left out of the record.
        """
        for row in DictReader(fh, skipinitialspace=True):
            data: Record = {}
            for ref, ref_value in row.items():
                value = sanitize_text(ref_value)
                if value is not None:
                    data[ref] = value
            yield data

    def read_csv_url(self, url: str) -> Generator[Record, None, None]:
        """Yield the records of a single CSV, fetched over HTTP(S) or from disk.

        Anything that is not an ``http``/``https`` URL is treated as a local
        file, opened by the path component of the URL.

        Raises:
            InvalidMapping: if the HTTP request does not succeed.
        """
        parsed_url = urlparse(url)
        log.info("Loading: %s", url)
        if parsed_url.scheme in ["http", "https"]:
            # A streamed response holds on to its connection until it is
            # fully consumed or closed; the context manager guarantees the
            # latter, also on errors and when the generator is abandoned.
            with requests.get(url, stream=True) as res:
                if not res.ok:
                    raise InvalidMapping("Failed to open CSV: %s" % url)
                # The encoding is forced to UTF-8 instead of relying on
                # whatever the response declares.
                res.encoding = "utf-8"
                lines = res.iter_lines(decode_unicode=True)
                yield from self.read_csv(lines)
        else:
            # newline="" is what the csv module requires of file objects, so
            # that newlines embedded in quoted fields are parsed correctly.
            # NOTE(review): the file is decoded with the platform default
            # encoding, whereas the HTTP branch assumes UTF-8 - confirm.
            with io.open(parsed_url.path, "r", newline="") as fh:
                yield from self.read_csv(fh)

    @property
    def records(self) -> Generator[Record, None, None]:
        """Iterate through the table applying filters on-the-go."""
        for url in self.urls:
            for record in self.read_csv_url(url):
                if self.check_filters(record):
                    yield record
|
followthemoney/mapping/entity.py
CHANGED
|
@@ -1,62 +1,111 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from hashlib import sha1
|
|
2
|
-
from
|
|
3
|
+
from warnings import warn
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
|
|
5
|
+
from banal import keys_values
|
|
6
|
+
from normality import stringify
|
|
3
7
|
|
|
4
|
-
from followthemoney.mapping.property import PropertyMapping
|
|
5
8
|
from followthemoney.types import registry
|
|
6
9
|
from followthemoney.util import key_bytes
|
|
10
|
+
from followthemoney.proxy import EntityProxy
|
|
11
|
+
from followthemoney.mapping.property import PropertyMapping
|
|
12
|
+
from followthemoney.mapping.source import Record
|
|
7
13
|
from followthemoney.exc import InvalidMapping
|
|
8
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from followthemoney.model import Model
|
|
17
|
+
from followthemoney.mapping.query import QueryMapping
|
|
18
|
+
|
|
19
|
+
log = logging.getLogger(__name__)
|
|
20
|
+
|
|
9
21
|
|
|
10
22
|
class EntityMapping(object):
|
|
23
|
+
__slots__ = (
|
|
24
|
+
"model",
|
|
25
|
+
"name",
|
|
26
|
+
"seed",
|
|
27
|
+
"keys",
|
|
28
|
+
"id_column",
|
|
29
|
+
"schema",
|
|
30
|
+
"refs",
|
|
31
|
+
"dependencies",
|
|
32
|
+
"properties",
|
|
33
|
+
)
|
|
11
34
|
|
|
12
|
-
def __init__(
|
|
35
|
+
def __init__(
|
|
36
|
+
self,
|
|
37
|
+
model: "Model",
|
|
38
|
+
query: "QueryMapping",
|
|
39
|
+
name: str,
|
|
40
|
+
data: Dict[str, Any],
|
|
41
|
+
key_prefix: Optional[str] = None,
|
|
42
|
+
) -> None:
|
|
13
43
|
self.model = model
|
|
14
44
|
self.name = name
|
|
15
|
-
self.data = data
|
|
16
45
|
|
|
17
46
|
self.seed = sha1(key_bytes(key_prefix))
|
|
18
|
-
self.seed.update(key_bytes(data.get(
|
|
47
|
+
self.seed.update(key_bytes(data.get("key_literal")))
|
|
19
48
|
|
|
20
|
-
self.keys =
|
|
21
|
-
self.
|
|
22
|
-
if not len(self.keys):
|
|
23
|
-
raise InvalidMapping("No keys: %r" % name)
|
|
49
|
+
self.keys = keys_values(data, "key", "keys")
|
|
50
|
+
self.id_column = stringify(data.get("id_column"))
|
|
51
|
+
if not len(self.keys) and self.id_column is None:
|
|
52
|
+
raise InvalidMapping("No keys or ID: %r" % name)
|
|
53
|
+
if len(self.keys) and self.id_column is not None:
|
|
54
|
+
msg = "Please use only keys or id_column, not both: %r" % name
|
|
55
|
+
raise InvalidMapping(msg)
|
|
24
56
|
|
|
25
|
-
|
|
26
|
-
if
|
|
27
|
-
raise InvalidMapping("
|
|
57
|
+
schema_name = stringify(data.get("schema"))
|
|
58
|
+
if schema_name is None:
|
|
59
|
+
raise InvalidMapping("No schema: %s" % name)
|
|
60
|
+
schema = model.get(schema_name)
|
|
61
|
+
if schema is None:
|
|
62
|
+
raise InvalidMapping("Invalid schema: %s" % schema_name)
|
|
63
|
+
if schema.deprecated:
|
|
64
|
+
warn(
|
|
65
|
+
"Mapping uses a deprecated schema: %r" % schema,
|
|
66
|
+
DeprecationWarning,
|
|
67
|
+
stacklevel=2,
|
|
68
|
+
)
|
|
69
|
+
self.schema = schema
|
|
28
70
|
|
|
29
71
|
self.refs = set(self.keys)
|
|
30
|
-
self.
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
72
|
+
if self.id_column:
|
|
73
|
+
self.refs.add(self.id_column)
|
|
74
|
+
self.dependencies: Set[str] = set()
|
|
75
|
+
self.properties: List[PropertyMapping] = []
|
|
76
|
+
for name, prop_mapping in data.get("properties", {}).items():
|
|
77
|
+
prop = self.schema.get(name)
|
|
78
|
+
if prop is None:
|
|
35
79
|
raise InvalidMapping("Invalid property: %s" % name)
|
|
36
|
-
|
|
37
|
-
self.properties.append(
|
|
38
|
-
self.refs.update(
|
|
39
|
-
if
|
|
40
|
-
self.dependencies.add(
|
|
80
|
+
mapping = PropertyMapping(query, prop_mapping, prop)
|
|
81
|
+
self.properties.append(mapping)
|
|
82
|
+
self.refs.update(mapping.refs)
|
|
83
|
+
if mapping.entity:
|
|
84
|
+
self.dependencies.add(mapping.entity)
|
|
41
85
|
|
|
42
|
-
def bind(self):
|
|
86
|
+
def bind(self) -> None:
|
|
43
87
|
for prop in self.properties:
|
|
44
88
|
prop.bind()
|
|
45
89
|
|
|
46
|
-
def compute_key(self, record):
|
|
90
|
+
def compute_key(self, record: Record) -> Optional[str]:
|
|
47
91
|
"""Generate a key for this entity, based on the given fields."""
|
|
92
|
+
if self.id_column is not None:
|
|
93
|
+
return record.get(self.id_column)
|
|
48
94
|
values = [key_bytes(record.get(k)) for k in self.keys]
|
|
49
95
|
digest = self.seed.copy()
|
|
96
|
+
has_value = False
|
|
50
97
|
for value in sorted(values):
|
|
51
|
-
|
|
52
|
-
|
|
98
|
+
if len(value):
|
|
99
|
+
has_value = True
|
|
100
|
+
digest.update(value)
|
|
101
|
+
if has_value:
|
|
53
102
|
return digest.hexdigest()
|
|
103
|
+
return None
|
|
54
104
|
|
|
55
|
-
def map(
|
|
105
|
+
def map(
|
|
106
|
+
self, record: Record, entities: Dict[str, EntityProxy]
|
|
107
|
+
) -> Optional[EntityProxy]:
|
|
56
108
|
proxy = self.model.make_entity(self.schema)
|
|
57
|
-
proxy.id = self.compute_key(record)
|
|
58
|
-
if proxy.id is None:
|
|
59
|
-
return
|
|
60
109
|
|
|
61
110
|
# THIS IS HACKY
|
|
62
111
|
# Some of the converters, e.g. for phone numbers, work better if they
|
|
@@ -64,23 +113,46 @@ class EntityMapping(object):
|
|
|
64
113
|
# detail, we are first running country fields, then making the data
|
|
65
114
|
# from that accessible to phone and address parsers.
|
|
66
115
|
for prop in self.properties:
|
|
67
|
-
if prop.
|
|
68
|
-
|
|
116
|
+
if prop.prop.type == registry.country:
|
|
117
|
+
discarded_values = prop.map(proxy, record, entities)
|
|
118
|
+
for value in discarded_values:
|
|
119
|
+
log.warning(
|
|
120
|
+
f'[{self.name}] Discarded unclean value "{value}" for property "{prop.prop.qname}".'
|
|
121
|
+
)
|
|
69
122
|
|
|
70
123
|
for prop in self.properties:
|
|
71
|
-
if prop.
|
|
72
|
-
|
|
73
|
-
|
|
124
|
+
if prop.prop.type != registry.country:
|
|
125
|
+
discarded_values = prop.map(proxy, record, entities)
|
|
126
|
+
for value in discarded_values:
|
|
127
|
+
log.warning(
|
|
128
|
+
f'[{self.name}] Discarding unclean value "{value}" for property "{prop.prop.qname}".'
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
# Generate the ID at the end to avoid self-reference checks on empty
|
|
132
|
+
# keys.
|
|
133
|
+
proxy.id = self.compute_key(record)
|
|
134
|
+
if proxy.id is None:
|
|
135
|
+
if self.id_column:
|
|
136
|
+
log.warning(
|
|
137
|
+
f'[{self.name}] Skipping entity because no ID could be computed. Make sure that there are no empty values in the "{self.id_column}" column.'
|
|
138
|
+
)
|
|
139
|
+
if self.keys:
|
|
140
|
+
log.warning(
|
|
141
|
+
f"[{self.name}] Skipping entity because no ID could be computed. Make sure that there are no empty values in key columns."
|
|
142
|
+
)
|
|
143
|
+
return None
|
|
74
144
|
|
|
75
145
|
for prop in self.properties:
|
|
76
|
-
if prop.required:
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
146
|
+
if prop.required and not proxy.has(prop.prop):
|
|
147
|
+
# This is a bit weird, it flags fields to be required in
|
|
148
|
+
# the mapping, not in the model. Basically it means: if
|
|
149
|
+
# this row of source data doesn't have that field, then do
|
|
150
|
+
# not map it again.
|
|
151
|
+
log.warning(
|
|
152
|
+
f'[{self.name}] Skipping entity because required property "{prop.prop.name}" is empty.'
|
|
153
|
+
)
|
|
154
|
+
return None
|
|
83
155
|
return proxy
|
|
84
156
|
|
|
85
|
-
def __repr__(self):
|
|
86
|
-
return
|
|
157
|
+
def __repr__(self) -> str:
|
|
158
|
+
return "<EntityMapping(%r)>" % self.name
|