followthemoney 1.3.6__py3-none-any.whl → 3.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +5 -3
- followthemoney/cli/__init__.py +17 -0
- followthemoney/cli/aggregate.py +56 -0
- followthemoney/cli/cli.py +88 -0
- followthemoney/cli/exports.py +121 -0
- followthemoney/cli/mapping.py +85 -0
- followthemoney/cli/sieve.py +67 -0
- followthemoney/cli/util.py +142 -0
- followthemoney/compare.py +132 -55
- followthemoney/exc.py +19 -6
- followthemoney/export/common.py +29 -0
- followthemoney/export/csv.py +82 -0
- followthemoney/export/excel.py +75 -0
- followthemoney/export/graph.py +79 -0
- followthemoney/export/neo4j.py +182 -0
- followthemoney/export/rdf.py +26 -0
- followthemoney/graph.py +308 -0
- followthemoney/helpers.py +212 -0
- followthemoney/mapping/__init__.py +1 -1
- followthemoney/mapping/csv.py +67 -35
- followthemoney/mapping/entity.py +116 -44
- followthemoney/mapping/property.py +90 -44
- followthemoney/mapping/query.py +27 -19
- followthemoney/mapping/source.py +15 -5
- followthemoney/mapping/sql.py +75 -61
- followthemoney/messages.py +13 -7
- followthemoney/model.py +108 -56
- followthemoney/namespace.py +119 -0
- followthemoney/offshore.py +48 -0
- followthemoney/ontology.py +77 -0
- followthemoney/property.py +204 -71
- followthemoney/proxy.py +455 -118
- followthemoney/rdf.py +9 -0
- followthemoney/schema/Address.yaml +78 -0
- followthemoney/schema/Airplane.yaml +17 -10
- followthemoney/schema/Analyzable.yaml +54 -0
- followthemoney/schema/Article.yaml +16 -0
- followthemoney/schema/Assessment.yaml +32 -0
- followthemoney/schema/Asset.yaml +10 -4
- followthemoney/schema/Associate.yaml +41 -0
- followthemoney/schema/Audio.yaml +24 -0
- followthemoney/schema/BankAccount.yaml +53 -9
- followthemoney/schema/Call.yaml +48 -0
- followthemoney/schema/CallForTenders.yaml +117 -0
- followthemoney/schema/Company.yaml +37 -12
- followthemoney/schema/Contract.yaml +41 -7
- followthemoney/schema/ContractAward.yaml +30 -11
- followthemoney/schema/CourtCase.yaml +16 -10
- followthemoney/schema/CourtCaseParty.yaml +17 -6
- followthemoney/schema/CryptoWallet.yaml +48 -0
- followthemoney/schema/Debt.yaml +37 -0
- followthemoney/schema/Directorship.yaml +17 -4
- followthemoney/schema/Document.yaml +72 -139
- followthemoney/schema/Documentation.yml +38 -0
- followthemoney/schema/EconomicActivity.yaml +32 -17
- followthemoney/schema/Email.yaml +76 -0
- followthemoney/schema/Employment.yaml +39 -0
- followthemoney/schema/Event.yaml +35 -3
- followthemoney/schema/Family.yaml +41 -0
- followthemoney/schema/Folder.yaml +13 -0
- followthemoney/schema/HyperText.yaml +21 -0
- followthemoney/schema/Identification.yaml +40 -0
- followthemoney/schema/Image.yaml +25 -0
- followthemoney/schema/Interest.yaml +3 -6
- followthemoney/schema/Interval.yaml +56 -5
- followthemoney/schema/LegalEntity.yaml +81 -20
- followthemoney/schema/License.yaml +7 -3
- followthemoney/schema/Membership.yaml +19 -4
- followthemoney/schema/Mention.yaml +54 -0
- followthemoney/schema/Message.yaml +73 -0
- followthemoney/schema/Note.yaml +23 -0
- followthemoney/schema/Occupancy.yaml +40 -0
- followthemoney/schema/Organization.yaml +38 -3
- followthemoney/schema/Ownership.yaml +16 -4
- followthemoney/schema/Package.yaml +17 -0
- followthemoney/schema/Page.yaml +43 -0
- followthemoney/schema/Pages.yaml +23 -0
- followthemoney/schema/Passport.yaml +15 -17
- followthemoney/schema/Payment.yaml +38 -7
- followthemoney/schema/Person.yaml +61 -5
- followthemoney/schema/PlainText.yaml +17 -0
- followthemoney/schema/Position.yaml +50 -0
- followthemoney/schema/Post.yaml +42 -0
- followthemoney/schema/Project.yaml +27 -0
- followthemoney/schema/ProjectParticipant.yaml +36 -0
- followthemoney/schema/PublicBody.yaml +14 -3
- followthemoney/schema/RealEstate.yaml +19 -3
- followthemoney/schema/Representation.yaml +17 -6
- followthemoney/schema/Sanction.yaml +44 -20
- followthemoney/schema/Security.yaml +59 -0
- followthemoney/schema/Similar.yaml +37 -0
- followthemoney/schema/Succession.yaml +36 -0
- followthemoney/schema/Table.yaml +32 -0
- followthemoney/schema/TaxRoll.yaml +27 -9
- followthemoney/schema/Thing.yaml +69 -13
- followthemoney/schema/Trip.yaml +42 -0
- followthemoney/schema/UnknownLink.yaml +17 -6
- followthemoney/schema/UserAccount.yaml +44 -0
- followthemoney/schema/Value.yaml +5 -1
- followthemoney/schema/Vehicle.yaml +25 -8
- followthemoney/schema/Vessel.yaml +18 -10
- followthemoney/schema/Video.yaml +20 -0
- followthemoney/schema/Workbook.yaml +18 -0
- followthemoney/schema.py +406 -135
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
- followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
- followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
- followthemoney/translations/fr/followthemoney.po +3861 -0
- followthemoney/translations/messages.pot +3021 -725
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
- followthemoney/translations/ru/followthemoney.po +4221 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
- followthemoney/types/__init__.py +35 -17
- followthemoney/types/address.py +41 -21
- followthemoney/types/checksum.py +25 -0
- followthemoney/types/common.py +233 -88
- followthemoney/types/country.py +89 -56
- followthemoney/types/date.py +59 -76
- followthemoney/types/email.py +66 -35
- followthemoney/types/entity.py +66 -13
- followthemoney/types/gender.py +66 -0
- followthemoney/types/iban.py +47 -28
- followthemoney/types/identifier.py +49 -22
- followthemoney/types/ip.py +35 -21
- followthemoney/types/json.py +58 -0
- followthemoney/types/language.py +124 -37
- followthemoney/types/mimetype.py +44 -0
- followthemoney/types/name.py +56 -12
- followthemoney/types/number.py +30 -0
- followthemoney/types/phone.py +92 -34
- followthemoney/types/registry.py +52 -0
- followthemoney/types/string.py +43 -0
- followthemoney/types/topic.py +94 -0
- followthemoney/types/url.py +39 -17
- followthemoney/util.py +139 -45
- followthemoney-3.8.0.dist-info/METADATA +153 -0
- followthemoney-3.8.0.dist-info/RECORD +157 -0
- {followthemoney-1.3.6.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
- followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
- followthemoney-1.3.6.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
- followthemoney/link.py +0 -75
- followthemoney/schema/Associate.yml +0 -19
- followthemoney/schema/Family.yml +0 -19
- followthemoney/schema/Land.yml +0 -9
- followthemoney/schema/Relationship.yaml +0 -26
- followthemoney/types/domain.py +0 -50
- followthemoney-1.3.6.dist-info/DESCRIPTION.rst +0 -3
- followthemoney-1.3.6.dist-info/METADATA +0 -39
- followthemoney-1.3.6.dist-info/RECORD +0 -108
- followthemoney-1.3.6.dist-info/entry_points.txt +0 -3
- followthemoney-1.3.6.dist-info/metadata.json +0 -1
- followthemoney-1.3.6.dist-info/namespace_packages.txt +0 -1
- followthemoney-1.3.6.dist-info/top_level.txt +0 -3
- ns/ontology.py +0 -128
- tests/types/test_addresses.py +0 -24
- tests/types/test_common.py +0 -27
- tests/types/test_countries.py +0 -21
- tests/types/test_dates.py +0 -72
- tests/types/test_domains.py +0 -23
- tests/types/test_emails.py +0 -30
- tests/types/test_entity.py +0 -16
- tests/types/test_iban.py +0 -109
- tests/types/test_identifiers.py +0 -25
- tests/types/test_ip.py +0 -26
- tests/types/test_languages.py +0 -20
- tests/types/test_names.py +0 -33
- tests/types/test_phones.py +0 -24
- tests/types/test_registry.py +0 -14
- tests/types/test_urls.py +0 -23
- {ns → followthemoney/export}/__init__.py +0 -0
- /tests/types/__init__.py → /followthemoney/py.typed +0 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from rdflib import Graph
|
|
3
|
+
from typing import List, Optional, TextIO
|
|
4
|
+
|
|
5
|
+
from followthemoney.export.common import Exporter
|
|
6
|
+
from followthemoney.proxy import E
|
|
7
|
+
|
|
8
|
+
log = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class RDFExporter(Exporter):
    """Exporter that writes entities to a text stream as N-Triples."""

    def __init__(self, fh: TextIO, qualified: bool = True) -> None:
        """Remember the output stream and the ``qualified`` flag.

        ``fh`` is the text handle that serialised triples are written to;
        ``qualified`` is handed on to ``proxy.triples()`` for every entity.
        """
        super().__init__()
        self.qualified = qualified
        self.fh = fh

    def write(self, proxy: E, extra: Optional[List[str]] = None) -> None:
        """Serialise the triples of ``proxy`` and append them to the stream.

        The ``extra`` argument is accepted but not used here. Any error while
        serialising or writing is logged with its traceback, not raised.
        """
        statements = Graph()
        for statement in proxy.triples(qualified=self.qualified):
            statements.add(statement)

        try:
            serialized = statements.serialize(format="nt11")
            # Normalise surrounding whitespace so that each entity's output
            # ends in exactly one newline.
            self.fh.write(serialized.strip() + "\n")
        except Exception:
            log.exception("Failed to serialize ntriples.")
|
followthemoney/graph.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Converting FtM data to a property graph data model.
|
|
3
|
+
|
|
4
|
+
This module provides an abstract data object that represents a property
|
|
5
|
+
graph. This is used by the exporter modules to convert data
|
|
6
|
+
to a specific output format, like Cypher or NetworkX.
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any, Dict, Generator, Iterable, List, Optional
|
|
10
|
+
|
|
11
|
+
from followthemoney.types import registry
|
|
12
|
+
from followthemoney.types.common import PropertyType
|
|
13
|
+
from followthemoney.schema import Schema
|
|
14
|
+
from followthemoney.proxy import EntityProxy
|
|
15
|
+
from followthemoney.property import Property
|
|
16
|
+
from followthemoney.exc import InvalidModel
|
|
17
|
+
|
|
18
|
+
log = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Node(object):
    """A node represents either an entity that can be rendered as a
    node in a graph, or as a re-ified value, like a name, email
    address or phone number.

    Nodes are identified by their ``id``: hashing and equality only look at
    that attribute, and comparing a node with anything that is not a node
    yields ``False`` instead of raising.
    """

    __slots__ = ["type", "value", "id", "proxy", "schema"]

    def __init__(
        self,
        type_: PropertyType,
        value: str,
        proxy: Optional[EntityProxy] = None,
        schema: Optional[Schema] = None,
    ) -> None:
        """Create a node for ``value`` of the given property type.

        If a ``proxy`` is given, its schema takes precedence over the
        ``schema`` argument.
        """
        self.type = type_
        self.value = value
        # The ID is whatever the property type derives from the value. It is
        # not validated here; the graph code checks for ``id`` being ``None``.
        self.id = type_.node_id_safe(value)
        self.proxy = proxy
        self.schema = schema if proxy is None else proxy.schema

    @property
    def is_entity(self) -> bool:
        """Check to see if the node represents an entity. If this is false, the
        node represents a non-entity property value that has been reified, like
        a phone number or a name."""
        return self.type == registry.entity

    @property
    def caption(self) -> str:
        """A user-facing label for the current node."""
        if self.type == registry.entity and self.proxy is not None:
            return self.proxy.caption
        # Fall back to the raw value if the type offers no caption for it.
        caption = self.type.caption(self.value)
        return caption or self.value

    def to_dict(self) -> Dict[str, Any]:
        """Return a simple dictionary to reflect this graph node."""
        return {
            "id": self.id,
            "type": self.type.name,
            "value": self.value,
            "caption": self.caption,
        }

    @classmethod
    def from_proxy(cls, proxy: EntityProxy) -> "Node":
        """For a given :class:`~followthemoney.proxy.EntityProxy`, return a node
        based on the entity."""
        return cls(registry.entity, proxy.id, proxy=proxy)

    def __str__(self) -> str:
        return self.caption

    def __repr__(self) -> str:
        return "<Node(%r, %r, %r)>" % (self.id, self.type, self.caption)

    def __hash__(self) -> int:
        return hash(self.id)

    def __eq__(self, other: Any) -> bool:
        # Returning NotImplemented lets Python fall back to its default
        # comparison, so ``node == None`` is False rather than an
        # AttributeError on ``other.id``.
        if not isinstance(other, Node):
            return NotImplemented
        return bool(self.id == other.id)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class Edge(object):
    """A link between two nodes.

    An edge is either derived from an edge-type entity (``proxy`` is set and
    provides the schema), or from a property value of an entity (``prop`` is
    set). Edges hash and compare by their ``id`` only; comparing an edge with
    a non-edge yields ``False`` instead of raising.
    """

    __slots__ = [
        "id",
        "weight",
        "source_id",
        "target_id",
        "prop",
        "proxy",
        "schema",
        "graph",
    ]

    def __init__(
        self,
        graph: "Graph",
        source: Node,
        target: Node,
        proxy: Optional[EntityProxy] = None,
        prop: Optional[Property] = None,
        value: Optional[str] = None,
    ) -> None:
        """Link ``source`` to ``target`` inside ``graph``.

        The weight defaults to ``1.0`` and is replaced by
        ``prop.specificity(value)`` when both ``prop`` and ``value`` are given.
        When a ``proxy`` is given, its ID becomes part of the edge ID and its
        schema becomes the edge schema.
        """
        self.graph = graph
        self.id = f"{source.id}<>{target.id}"
        self.source_id = source.id
        self.target_id = target.id
        self.weight = 1.0
        self.prop = prop
        self.proxy = proxy
        self.schema: Optional[Schema] = None
        if prop is not None and value is not None:
            self.weight = prop.specificity(value)
        if proxy is not None:
            self.id = f"{source.id}<{proxy.id}>{target.id}"
            self.schema = proxy.schema

    @property
    def source(self) -> Optional[Node]:
        """The graph node from which the edge originates."""
        if self.source_id is None:
            return None
        return self.graph.nodes.get(self.source_id)

    @property
    def source_prop(self) -> Property:
        """Get the entity property originating this edge.

        Prefers the reverse of the schema's source property and falls back to
        the edge's own property; raises ``InvalidModel`` if neither exists.
        """
        if self.schema is not None and self.schema.source_prop is not None:
            if self.schema.source_prop.reverse is not None:
                return self.schema.source_prop.reverse
        if self.prop is None:
            raise InvalidModel("Contradiction: %r" % self)
        return self.prop

    @property
    def target(self) -> Optional[Node]:
        """The graph node to which the edge points."""
        if self.target_id is None:
            return None
        return self.graph.nodes.get(self.target_id)

    @property
    def target_prop(self) -> Optional[Property]:
        """Get the property on the target side of this edge.

        This is the reverse of the schema's target property, or the reverse
        of the edge's own property. ``None`` means there is no such property.
        """
        if self.schema is not None and self.schema.target_prop is not None:
            return self.schema.target_prop.reverse
        if self.prop is not None:
            return self.prop.reverse
        # NOTE: this edge points at a value node.
        return None

    @property
    def type_name(self) -> str:
        """Return a machine-readable description of the type of the edge.
        This is either a property name or a schema name."""
        if self.schema is not None:
            return self.schema.name
        if self.prop is None:
            raise InvalidModel("Invalid edge: %r" % self)
        return self.prop.name

    def to_dict(self) -> Dict[str, Optional[str]]:
        """Return a simple dictionary to reflect this graph edge."""
        return {
            "id": self.id,
            "source_id": self.source_id,
            "target_id": self.target_id,
            "type_name": self.type_name,
        }

    def __repr__(self) -> str:
        return "<Edge(%r)>" % self.id

    def __hash__(self) -> int:
        return hash(self.id)

    def __eq__(self, other: Any) -> bool:
        # Returning NotImplemented lets Python fall back to its default
        # comparison, so ``edge == None`` is False rather than an
        # AttributeError on ``other.id``.
        if not isinstance(other, Edge):
            return NotImplemented
        return bool(self.id == other.id)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class Graph(object):
    """A set of nodes and edges, derived from entities and their properties.
    This represents an alternative interpretation of FtM data as a property
    graph.

    This class is meant to be extensible in order to support additional
    backends, like Aleph.
    """

    def __init__(self, edge_types: Iterable[PropertyType] = registry.pivots) -> None:
        """Create an empty graph.

        ``edge_types`` selects the property types whose values get reified
        into value nodes; only those flagged as ``matchable`` are kept.
        """
        types = registry.get_types(edge_types)
        self.edge_types = [t for t in types if t.matchable]
        self.flush()

    def flush(self) -> None:
        """Remove all nodes, edges and proxies from the graph."""
        self.edges: Dict[str, Edge] = {}
        self.nodes: Dict[str, Node] = {}
        self.proxies: Dict[str, Optional[EntityProxy]] = {}

    def queue(self, id_: str, proxy: Optional[EntityProxy] = None) -> None:
        """Register a reference to an entity in the graph."""
        # A bare reference (no proxy) must never overwrite a loaded proxy.
        if id_ not in self.proxies or proxy is not None:
            self.proxies[id_] = proxy

    @property
    def queued(self) -> List[str]:
        """Return a list of all the entities which are referenced from the graph
        but that haven't been loaded yet. This can be used to get a list of
        entities that should be included to expand the whole graph by one degree.
        """
        return [i for (i, p) in self.proxies.items() if p is None]

    def _get_node_stub(self, prop: Property, value: str) -> Node:
        """Get or create the node that ``value`` of ``prop`` refers to.

        Entity references are queued for later loading. A node without an ID
        is returned as-is and not registered in the graph.
        """
        if prop.type == registry.entity:
            self.queue(value)
        node = Node(prop.type, value, schema=prop.range)
        if node.id is None:
            return node
        if node.id not in self.nodes:
            self.nodes[node.id] = node
        return self.nodes[node.id]

    def _add_edge(self, proxy: EntityProxy, source: str, target: str) -> None:
        """Turn one source/target pair of an edge-type proxy into an edge."""
        if proxy.schema.source_prop is None:
            raise InvalidModel("Invalid edge entity: %r" % proxy)
        source_node = self._get_node_stub(proxy.schema.source_prop, source)
        if proxy.schema.target_prop is None:
            raise InvalidModel("Invalid edge entity: %r" % proxy)
        target_node = self._get_node_stub(proxy.schema.target_prop, target)
        if source_node.id is not None and target_node.id is not None:
            edge = Edge(self, source_node, target_node, proxy=proxy)
            self.edges[edge.id] = edge

    def _add_node(self, proxy: EntityProxy) -> None:
        """Derive a node and its value edges from the given proxy."""
        entity = Node.from_proxy(proxy)
        if entity.id is not None:
            self.nodes[entity.id] = entity
        for prop, value in proxy.itervalues():
            if prop.type not in self.edge_types:
                continue
            node = self._get_node_stub(prop, value)
            # An edge needs an ID on both ends (as in ``_add_edge``);
            # otherwise it would be stored under a key containing "None".
            if entity.id is None or node.id is None:
                continue
            edge = Edge(self, entity, node, prop=prop, value=value)
            if edge.weight > 0:
                self.edges[edge.id] = edge

    def add(self, proxy: EntityProxy) -> None:
        """Add an :class:`~followthemoney.proxy.EntityProxy` to the graph and make
        it either a :class:`~followthemoney.graph.Node` or an
        :class:`~followthemoney.graph.Edge`."""
        if proxy is None:
            return
        self.queue(proxy.id, proxy)
        if proxy.schema.edge:
            for (source, target) in proxy.edgepairs():
                self._add_edge(proxy, source, target)
        else:
            self._add_node(proxy)

    def iternodes(self) -> Iterable[Node]:
        """Iterate all :class:`nodes <followthemoney.graph.Node>` in the graph."""
        return self.nodes.values()

    def iteredges(self) -> Iterable[Edge]:
        """Iterate all :class:`edges <followthemoney.graph.Edge>` in the graph."""
        return self.edges.values()

    def get_outbound(
        self, node: Node, prop: Optional[Property] = None
    ) -> Generator[Edge, None, None]:
        """Get all edges pointed out from the given node."""
        for edge in self.iteredges():
            source = edge.source
            # The source can be unresolved (``None``); never compare that.
            if source is None or source != node:
                continue
            if prop and edge.source_prop != prop:
                continue
            yield edge

    def get_inbound(
        self, node: Node, prop: Optional[Property] = None
    ) -> Generator[Edge, None, None]:
        """Get all edges pointed at the given node."""
        for edge in self.iteredges():
            target = edge.target
            # The target can be unresolved (``None``); never compare that.
            if target is None or target != node:
                continue
            if prop and edge.target_prop != prop:
                continue
            yield edge

    def get_adjacent(
        self, node: Node, prop: Optional[Property] = None
    ) -> Generator[Edge, None, None]:
        "Get all adjacent edges of the given node."
        yield from self.get_outbound(node, prop=prop)
        yield from self.get_inbound(node, prop=prop)

    def to_dict(self) -> Dict[str, Any]:
        """Return a dictionary with the graph nodes and edges."""
        return {
            "nodes": [n.to_dict() for n in self.iternodes()],
            "edges": [e.to_dict() for e in self.iteredges()],
        }
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# This module violates the boundary between the role of code and
|
|
2
|
+
# YAML in the rest of followthemoney. It handles normalisations
|
|
3
|
+
# which would be much harder to express in abstract, especially
|
|
4
|
+
# those that simplify the data based on their pragmatics.
|
|
5
|
+
#
|
|
6
|
+
# If anyone were to swap out the default model, this would
|
|
7
|
+
# probably be the first place to break.
|
|
8
|
+
from os.path import splitext
|
|
9
|
+
from typing import Iterable, List, Optional, Set
|
|
10
|
+
from normality import safe_filename
|
|
11
|
+
from mimetypes import guess_extension
|
|
12
|
+
from itertools import product
|
|
13
|
+
from datetime import datetime, timedelta
|
|
14
|
+
|
|
15
|
+
from followthemoney.types import registry
|
|
16
|
+
from followthemoney.proxy import E
|
|
17
|
+
from followthemoney.util import join_text
|
|
18
|
+
|
|
19
|
+
PROV_MIN_DATES = ("createdAt", "authoredAt", "publishedAt")
|
|
20
|
+
PROV_MAX_DATES = ("modifiedAt", "retrievedAt")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def remove_checksums(proxy: E) -> E:
    """Strip every checksum-typed property from the given entity.

    When accepting entities via a web API, it would constitute a security
    risk to allow a user to submit checksum-type properties: these can be
    traded in for access to the files they identify, if those exist in the
    underlying content-addressed storage. It seems safest to just remove all
    checksums from entities that are untrusted user input.

    The entity is modified in place and returned.
    """
    checksum_props = [
        prop for prop in proxy.iterprops() if prop.type == registry.checksum
    ]
    for checksum_prop in checksum_props:
        proxy.pop(checksum_prop)
    return proxy
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def simplify_provenance(proxy: E) -> E:
    """Reduce each provenance date property to a single value.

    If several dates are given for a provenance field, only the most
    meaningful one is kept: the latest for the properties in
    ``PROV_MAX_DATES`` and the earliest for those in ``PROV_MIN_DATES``.
    The entity is modified in place and returned.
    """
    for prop_names, choose in ((PROV_MAX_DATES, max), (PROV_MIN_DATES, min)):
        for prop_name in prop_names:
            found = proxy.pop(prop_name, quiet=True)
            if len(found) > 0:
                proxy.set(prop_name, choose(found), cleaned=True)
    return proxy
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def entity_filename(
    proxy: E, base_name: Optional[str] = None, extension: Optional[str] = None
) -> Optional[str]:
    """Derive a safe filename for the given entity.

    Explicitly passed ``base_name`` and ``extension`` always win. For
    ``Document`` entities, missing parts are filled in from the ``extension``
    property, then from the ``fileName`` values, and finally (extension only)
    by guessing from ``mimeType``. The entity ID is the last-resort base name.
    """
    if proxy.schema.is_a("Document"):
        for declared in proxy.get("extension", quiet=True):
            if extension is None:
                extension = declared
        for file_name in proxy.get("fileName", quiet=True):
            stem, suffix = splitext(file_name)
            if base_name is None and len(stem) > 0:
                base_name = stem
            if extension is None and len(suffix) > 0:
                extension = suffix
        for mime_type in proxy.get("mimeType", quiet=True):
            # ``guess_extension`` can return None, in which case the next
            # MIME type gets a chance.
            if extension is None:
                extension = guess_extension(mime_type)
    if not base_name:
        base_name = proxy.id
    return safe_filename(base_name, extension=extension)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def name_entity(entity: E) -> E:
    """Keep one main name on the entity and demote the rest to aliases.

    For ``Thing`` entities with more than one ``name``, the name chosen by
    ``registry.name.pick`` becomes the only ``name`` and the remaining ones
    are added as ``alias``. This is awkward given that names are not special
    and may not always be the caption. The entity is returned in all cases.
    """
    if not entity.schema.is_a("Thing"):
        return entity
    names = entity.get("name")
    if len(names) <= 1:
        return entity
    picked = registry.name.pick(names)
    # The picked name is only removed from the aliases if it is one of the
    # original values.
    if picked in names:
        names.remove(picked)
    entity.set("name", picked)
    entity.add("alias", names)
    return entity
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _is_before_cutoff(date: str, cutoff: str) -> bool:
    """Check if a (possibly partial) ISO date is strictly before ``cutoff``.

    The cutoff is truncated to the precision of ``date``, so that a prefix
    date like ``2000`` is not treated as earlier than ``2000-01-01T00:00:00``.
    """
    return date < cutoff[: len(date)]


def check_person_cutoff(
    entity: E,
    death_cutoff: datetime = datetime(2000, 1, 1),
    birth_cutoff: Optional[datetime] = None,
) -> bool:
    """Check if a person has been dead long enough to not be relevant for
    investigations any more.

    Returns ``True`` if the entity is a ``Person`` and either its latest
    ``deathDate`` is before ``death_cutoff`` or its earliest ``birthDate`` is
    before ``birth_cutoff``. Dates are compared at their own precision, so a
    date that merely shares a prefix with the cutoff does not count as being
    before it. Non-persons and persons without such dates yield ``False``.
    """
    if not entity.schema.is_a("Person"):
        return False
    death_dates = entity.get("deathDate", quiet=True)
    if death_dates and _is_before_cutoff(
        max(death_dates), death_cutoff.isoformat()
    ):
        return True
    birth_dates = entity.get("birthDate", quiet=True)
    if birth_cutoff is None:
        # Default: 100 * 365 days (slightly under 100 calendar years)
        # before the death cutoff.
        birth_cutoff = death_cutoff - timedelta(days=100 * 365)
    if birth_dates and _is_before_cutoff(
        min(birth_dates), birth_cutoff.isoformat()
    ):
        return True
    return False
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def remove_prefix_dates(entity: E) -> E:
    """Drop date values that are mere prefixes of more precise ones.

    If an entity has multiple values for a date property, the less precise
    ones are redundant: for a Person with a birthDate of both 1990 and
    1990-05-01, the mention of 1990 is removed. This is applied to every
    date-typed property; the entity is modified in place and returned.
    """
    date_props = (prop for prop in entity.iterprops() if prop.type == registry.date)
    for date_prop in date_props:
        entity.set(date_prop, remove_prefix_date_values(entity.get(date_prop)))
    return entity
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def remove_prefix_date_values(values: Iterable[str]) -> List[str]:
    """Remove all values that are a prefix of another given value.

    See ``remove_prefix_dates``. The result is ordered from the longest to
    the shortest value (ties keep their input order), and exact duplicates
    are collapsed because a string is a prefix of itself.
    """
    kept: List[str] = []
    # Longest first, so every string that ``value`` could be a prefix of has
    # been seen already. Checking kept values is enough: a dropped value is
    # itself a prefix of something that was kept.
    for value in sorted(values, key=len, reverse=True):
        if not any(longer.startswith(value) for longer in kept):
            kept.append(value)
    return kept
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def inline_names(entity: E, related: E) -> None:
    """Copy the names of ``related`` into ``entity``'s ``namesMentioned``.

    This is a partial work-around for a UI problem. Imagine a user is shown a
    list of payments between a sender and a beneficiary. Searching for a term
    from the sender or recipient name yields no result, because the name is
    only indexed with the parties, not with the payment.

    Questionable in theory, but really useful in practice. Nothing happens if
    the entity's schema has no ``namesMentioned`` property.
    """
    mentioned = entity.schema.get("namesMentioned")
    if mentioned is None:
        return
    entity.add(mentioned, related.get_type_values(registry.name))
|
|
149
|
+
|
|
150
|
+
def combine_names(entity: E) -> E:
    """Build full-name aliases from the name parts of a person entity.

    Every combination of first, second, middle, father and last name is
    joined and added as an ``alias``; the three inner parts are optional.
    This is of course impossible to do culturally correctly for the whole
    planet at once, so it should be mostly used for internal-facing (e.g.
    matching) processes. The entity is returned in all cases.
    """
    if not entity.schema.is_a("Person"):
        return entity

    first_names = entity.get("firstName")
    optional_parts = []
    for prop_name in ("secondName", "middleName", "fatherName"):
        part_values = entity.get(prop_name)
        # The empty string stands for "leave this part out".
        part_values.append("")
        optional_parts.append(part_values)
    last_names = entity.get("lastName")

    for parts in product(first_names, *optional_parts, last_names):
        name = join_text(*parts)
        if name is not None:
            entity.add("alias", name)

    # If no first name is given, at least add the last name:
    if not entity.get_type_values(registry.name) and len(last_names) > 0:
        entity.add("alias", last_names)
    return entity
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def dates_years(dates: Iterable[Optional[str]]) -> Set[str]:
    """Get the unique years from a set of date strings.

    The year is taken to be the first four characters of each date. ``None``
    and empty strings are skipped, so the result never contains an empty
    "year".
    """
    return {date[:4] for date in dates if date}
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def post_summary(
    organization: str,
    role: Optional[str],
    start_dates: Iterable[Optional[str]],
    end_dates: Iterable[Optional[str]],
    dates: Iterable[Optional[str]],
) -> str:
    """Make a string summary for a Post object.

    The summary is the organization name, followed in brackets by the role
    and a date indication where available. The date indication is a
    ``start-end`` year range if a start or end year is known, otherwise the
    sorted list of years from ``dates``.
    """
    # NOTE(review): the earliest year is used for the end as well as for the
    # start - confirm that this is intended.
    start = min(dates_years(start_dates), default="")
    end = min(dates_years(end_dates), default="")
    years = dates_years(dates)

    if start or end:
        date_range: Optional[str] = f"{start}-{end}"
    elif years:
        date_range = ", ".join(sorted(years))
    else:
        date_range = None

    if role and date_range:
        bracketed = f"{role}, {date_range}"
    else:
        bracketed = role or date_range

    if not bracketed:
        return organization
    return f"{organization} ({bracketed})"
|
followthemoney/mapping/csv.py
CHANGED
|
@@ -1,65 +1,97 @@
|
|
|
1
1
|
import io
|
|
2
2
|
import os
|
|
3
3
|
import logging
|
|
4
|
+
from banal.lists import ensure_list
|
|
4
5
|
import requests
|
|
5
6
|
from csv import DictReader
|
|
6
|
-
from
|
|
7
|
-
from
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
from banal import keys_values
|
|
9
|
+
from typing import (
|
|
10
|
+
TYPE_CHECKING,
|
|
11
|
+
Any,
|
|
12
|
+
Dict,
|
|
13
|
+
Generator,
|
|
14
|
+
ItemsView,
|
|
15
|
+
Iterable,
|
|
16
|
+
List,
|
|
17
|
+
Optional,
|
|
18
|
+
Set,
|
|
19
|
+
Tuple,
|
|
20
|
+
cast,
|
|
21
|
+
)
|
|
8
22
|
|
|
9
|
-
from followthemoney.mapping.source import Source
|
|
23
|
+
from followthemoney.mapping.source import Record, Source
|
|
24
|
+
from followthemoney.util import sanitize_text
|
|
10
25
|
from followthemoney.exc import InvalidMapping
|
|
11
26
|
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from followthemoney.mapping.query import QueryMapping
|
|
29
|
+
|
|
12
30
|
log = logging.getLogger(__name__)
|
|
31
|
+
FilterList = List[Tuple[str, Set[Optional[str]]]]
|
|
13
32
|
|
|
14
33
|
|
|
15
34
|
class CSVSource(Source):
    """Special case for entity loading directly from a CSV URL.

    The mapping data must name at least one CSV location via the ``csv_url``
    or ``csv_urls`` keys. Locations with an ``http``/``https`` scheme are
    fetched over the network; anything else is opened as a local file using
    the path component of the parsed URL.
    """

    def __init__(self, query: "QueryMapping", data: Dict[str, Any]) -> None:
        """Collect the CSV locations and pre-compute the filter sets.

        Raises:
            InvalidMapping: if ``data`` does not specify any CSV URL.
        """
        super().__init__(query, data)
        self.urls: Set[str] = set()
        for url in keys_values(data, "csv_url", "csv_urls"):
            # Allow environment variables (e.g. $DATA_DIR) inside locations.
            self.urls.add(cast(str, os.path.expandvars(url)))

        if not self.urls:
            raise InvalidMapping("No CSV URLs are specified.")

        # self.filters / self.filters_not are expected to be set up by the
        # Source base class (not visible here) -- confirm against Source.
        self.filters_set = self._parse_filters(self.filters)
        self.filters_not_set = self._parse_filters(self.filters_not)

    def _parse_filters(self, filters: ItemsView[str, Any]) -> FilterList:
        """Turn ``(column, value-or-values)`` pairs into ``(column, set)`` pairs.

        Each value is passed through ``ensure_list`` and stored as a set so
        that membership can be tested directly in ``check_filters``.
        """
        return [
            (key, set(cast(List[Optional[str]], ensure_list(value))))
            for (key, value) in filters
        ]

    def check_filters(self, data: Record) -> bool:
        """Return True if ``data`` passes every include and exclude filter.

        A record is rejected when a filtered column's value is not among the
        accepted values, or when it is among the values of a negative filter.
        """
        for (k, v) in self.filters_set:
            if data.get(k) not in v:
                return False
        for (k, v) in self.filters_not_set:
            if data.get(k) in v:
                return False
        return True

    @classmethod
    def read_csv(cls, fh: Iterable[str]) -> Generator[Record, None, None]:
        """Parse CSV text lines into records keyed by the header row.

        Cells for which ``sanitize_text`` returns ``None`` are left out of
        the yielded record.
        """
        for row in DictReader(fh, skipinitialspace=True):
            data: Record = {}
            for ref, ref_value in row.items():
                value = sanitize_text(ref_value)
                if value is not None:
                    data[ref] = value
            yield data

    def read_csv_url(self, url: str) -> Generator[Record, None, None]:
        """Yield all records of the CSV found at ``url``.

        Raises:
            InvalidMapping: if an HTTP(S) request does not succeed.
        """
        parsed_url = urlparse(url)
        log.info("Loading: %s", url)
        if parsed_url.scheme in ["http", "https"]:
            # Use the response as a context manager so the streamed
            # connection is released even if the consumer stops early.
            with requests.get(url, stream=True) as res:
                if not res.ok:
                    raise InvalidMapping("Failed to open CSV: %s" % url)
                # Remote CSVs are always decoded as UTF-8, regardless of the
                # encoding announced by the server.
                res.encoding = "utf-8"
                lines = res.iter_lines(decode_unicode=True)
                yield from self.read_csv(lines)
        else:
            # newline="" is what the csv module asks for, so that newlines
            # embedded in quoted fields are handed to the parser untouched.
            # NOTE(review): the file is decoded with the platform default
            # encoding, unlike the HTTP branch above -- confirm intent.
            with io.open(parsed_url.path, "r", newline="") as fh:
                yield from self.read_csv(fh)

    @property
    def records(self) -> Generator[Record, None, None]:
        """Iterate through the table applying filters on-the-go."""
        for url in self.urls:
            for record in self.read_csv_url(url):
                if self.check_filters(record):
                    yield record
|