followthemoney 1.3.6__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. followthemoney/__init__.py +5 -3
  2. followthemoney/cli/__init__.py +17 -0
  3. followthemoney/cli/aggregate.py +56 -0
  4. followthemoney/cli/cli.py +88 -0
  5. followthemoney/cli/exports.py +121 -0
  6. followthemoney/cli/mapping.py +85 -0
  7. followthemoney/cli/sieve.py +67 -0
  8. followthemoney/cli/util.py +142 -0
  9. followthemoney/compare.py +132 -55
  10. followthemoney/exc.py +19 -6
  11. followthemoney/export/common.py +29 -0
  12. followthemoney/export/csv.py +82 -0
  13. followthemoney/export/excel.py +75 -0
  14. followthemoney/export/graph.py +79 -0
  15. followthemoney/export/neo4j.py +182 -0
  16. followthemoney/export/rdf.py +26 -0
  17. followthemoney/graph.py +308 -0
  18. followthemoney/helpers.py +212 -0
  19. followthemoney/mapping/__init__.py +1 -1
  20. followthemoney/mapping/csv.py +67 -35
  21. followthemoney/mapping/entity.py +116 -44
  22. followthemoney/mapping/property.py +90 -44
  23. followthemoney/mapping/query.py +27 -19
  24. followthemoney/mapping/source.py +15 -5
  25. followthemoney/mapping/sql.py +75 -61
  26. followthemoney/messages.py +13 -7
  27. followthemoney/model.py +108 -56
  28. followthemoney/namespace.py +119 -0
  29. followthemoney/offshore.py +48 -0
  30. followthemoney/ontology.py +77 -0
  31. followthemoney/property.py +204 -71
  32. followthemoney/proxy.py +455 -118
  33. followthemoney/rdf.py +9 -0
  34. followthemoney/schema/Address.yaml +78 -0
  35. followthemoney/schema/Airplane.yaml +17 -10
  36. followthemoney/schema/Analyzable.yaml +54 -0
  37. followthemoney/schema/Article.yaml +16 -0
  38. followthemoney/schema/Assessment.yaml +32 -0
  39. followthemoney/schema/Asset.yaml +10 -4
  40. followthemoney/schema/Associate.yaml +41 -0
  41. followthemoney/schema/Audio.yaml +24 -0
  42. followthemoney/schema/BankAccount.yaml +53 -9
  43. followthemoney/schema/Call.yaml +48 -0
  44. followthemoney/schema/CallForTenders.yaml +117 -0
  45. followthemoney/schema/Company.yaml +37 -12
  46. followthemoney/schema/Contract.yaml +41 -7
  47. followthemoney/schema/ContractAward.yaml +30 -11
  48. followthemoney/schema/CourtCase.yaml +16 -10
  49. followthemoney/schema/CourtCaseParty.yaml +17 -6
  50. followthemoney/schema/CryptoWallet.yaml +48 -0
  51. followthemoney/schema/Debt.yaml +37 -0
  52. followthemoney/schema/Directorship.yaml +17 -4
  53. followthemoney/schema/Document.yaml +72 -139
  54. followthemoney/schema/Documentation.yml +38 -0
  55. followthemoney/schema/EconomicActivity.yaml +32 -17
  56. followthemoney/schema/Email.yaml +76 -0
  57. followthemoney/schema/Employment.yaml +39 -0
  58. followthemoney/schema/Event.yaml +35 -3
  59. followthemoney/schema/Family.yaml +41 -0
  60. followthemoney/schema/Folder.yaml +13 -0
  61. followthemoney/schema/HyperText.yaml +21 -0
  62. followthemoney/schema/Identification.yaml +40 -0
  63. followthemoney/schema/Image.yaml +25 -0
  64. followthemoney/schema/Interest.yaml +3 -6
  65. followthemoney/schema/Interval.yaml +56 -5
  66. followthemoney/schema/LegalEntity.yaml +81 -20
  67. followthemoney/schema/License.yaml +7 -3
  68. followthemoney/schema/Membership.yaml +19 -4
  69. followthemoney/schema/Mention.yaml +54 -0
  70. followthemoney/schema/Message.yaml +73 -0
  71. followthemoney/schema/Note.yaml +23 -0
  72. followthemoney/schema/Occupancy.yaml +40 -0
  73. followthemoney/schema/Organization.yaml +38 -3
  74. followthemoney/schema/Ownership.yaml +16 -4
  75. followthemoney/schema/Package.yaml +17 -0
  76. followthemoney/schema/Page.yaml +43 -0
  77. followthemoney/schema/Pages.yaml +23 -0
  78. followthemoney/schema/Passport.yaml +15 -17
  79. followthemoney/schema/Payment.yaml +38 -7
  80. followthemoney/schema/Person.yaml +61 -5
  81. followthemoney/schema/PlainText.yaml +17 -0
  82. followthemoney/schema/Position.yaml +50 -0
  83. followthemoney/schema/Post.yaml +42 -0
  84. followthemoney/schema/Project.yaml +27 -0
  85. followthemoney/schema/ProjectParticipant.yaml +36 -0
  86. followthemoney/schema/PublicBody.yaml +14 -3
  87. followthemoney/schema/RealEstate.yaml +19 -3
  88. followthemoney/schema/Representation.yaml +17 -6
  89. followthemoney/schema/Sanction.yaml +44 -20
  90. followthemoney/schema/Security.yaml +59 -0
  91. followthemoney/schema/Similar.yaml +37 -0
  92. followthemoney/schema/Succession.yaml +36 -0
  93. followthemoney/schema/Table.yaml +32 -0
  94. followthemoney/schema/TaxRoll.yaml +27 -9
  95. followthemoney/schema/Thing.yaml +69 -13
  96. followthemoney/schema/Trip.yaml +42 -0
  97. followthemoney/schema/UnknownLink.yaml +17 -6
  98. followthemoney/schema/UserAccount.yaml +44 -0
  99. followthemoney/schema/Value.yaml +5 -1
  100. followthemoney/schema/Vehicle.yaml +25 -8
  101. followthemoney/schema/Vessel.yaml +18 -10
  102. followthemoney/schema/Video.yaml +20 -0
  103. followthemoney/schema/Workbook.yaml +18 -0
  104. followthemoney/schema.py +406 -135
  105. followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
  106. followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
  107. followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
  108. followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
  109. followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
  110. followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
  111. followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
  112. followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
  113. followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
  114. followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
  115. followthemoney/translations/fr/followthemoney.po +3861 -0
  116. followthemoney/translations/messages.pot +3021 -725
  117. followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
  118. followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
  119. followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
  120. followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
  121. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
  122. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
  123. followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
  124. followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
  125. followthemoney/translations/ru/followthemoney.po +4221 -0
  126. followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
  127. followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
  128. followthemoney/types/__init__.py +35 -17
  129. followthemoney/types/address.py +41 -21
  130. followthemoney/types/checksum.py +25 -0
  131. followthemoney/types/common.py +233 -88
  132. followthemoney/types/country.py +89 -56
  133. followthemoney/types/date.py +59 -76
  134. followthemoney/types/email.py +66 -35
  135. followthemoney/types/entity.py +66 -13
  136. followthemoney/types/gender.py +66 -0
  137. followthemoney/types/iban.py +47 -28
  138. followthemoney/types/identifier.py +49 -22
  139. followthemoney/types/ip.py +35 -21
  140. followthemoney/types/json.py +58 -0
  141. followthemoney/types/language.py +124 -37
  142. followthemoney/types/mimetype.py +44 -0
  143. followthemoney/types/name.py +56 -12
  144. followthemoney/types/number.py +30 -0
  145. followthemoney/types/phone.py +92 -34
  146. followthemoney/types/registry.py +52 -0
  147. followthemoney/types/string.py +43 -0
  148. followthemoney/types/topic.py +94 -0
  149. followthemoney/types/url.py +39 -17
  150. followthemoney/util.py +139 -45
  151. followthemoney-3.8.0.dist-info/METADATA +153 -0
  152. followthemoney-3.8.0.dist-info/RECORD +157 -0
  153. {followthemoney-1.3.6.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
  154. followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
  155. followthemoney-1.3.6.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
  156. followthemoney/link.py +0 -75
  157. followthemoney/schema/Associate.yml +0 -19
  158. followthemoney/schema/Family.yml +0 -19
  159. followthemoney/schema/Land.yml +0 -9
  160. followthemoney/schema/Relationship.yaml +0 -26
  161. followthemoney/types/domain.py +0 -50
  162. followthemoney-1.3.6.dist-info/DESCRIPTION.rst +0 -3
  163. followthemoney-1.3.6.dist-info/METADATA +0 -39
  164. followthemoney-1.3.6.dist-info/RECORD +0 -108
  165. followthemoney-1.3.6.dist-info/entry_points.txt +0 -3
  166. followthemoney-1.3.6.dist-info/metadata.json +0 -1
  167. followthemoney-1.3.6.dist-info/namespace_packages.txt +0 -1
  168. followthemoney-1.3.6.dist-info/top_level.txt +0 -3
  169. ns/ontology.py +0 -128
  170. tests/types/test_addresses.py +0 -24
  171. tests/types/test_common.py +0 -27
  172. tests/types/test_countries.py +0 -21
  173. tests/types/test_dates.py +0 -72
  174. tests/types/test_domains.py +0 -23
  175. tests/types/test_emails.py +0 -30
  176. tests/types/test_entity.py +0 -16
  177. tests/types/test_iban.py +0 -109
  178. tests/types/test_identifiers.py +0 -25
  179. tests/types/test_ip.py +0 -26
  180. tests/types/test_languages.py +0 -20
  181. tests/types/test_names.py +0 -33
  182. tests/types/test_phones.py +0 -24
  183. tests/types/test_registry.py +0 -14
  184. tests/types/test_urls.py +0 -23
  185. {ns → followthemoney/export}/__init__.py +0 -0
  186. /tests/types/__init__.py → /followthemoney/py.typed +0 -0
@@ -0,0 +1,26 @@
1
+ import logging
2
+ from rdflib import Graph
3
+ from typing import List, Optional, TextIO
4
+
5
+ from followthemoney.export.common import Exporter
6
+ from followthemoney.proxy import E
7
+
8
+ log = logging.getLogger(__name__)
9
+
10
+
11
class RDFExporter(Exporter):
    """Exporter that emits entities as N-Triples on a text stream.

    ``fh`` is the writable text handle that receives the output and
    ``qualified`` is handed through unchanged to ``proxy.triples()``.
    """

    def __init__(self, fh: TextIO, qualified: bool = True) -> None:
        super().__init__()
        self.qualified = qualified
        self.fh = fh

    def write(self, proxy: E, extra: Optional[List[str]] = None) -> None:
        """Serialize the triples of ``proxy`` and append them to ``self.fh``.

        ``extra`` is accepted but not used by this exporter. Any exception
        raised while serializing or writing is logged and swallowed, so a
        single bad entity does not abort the export.
        """
        statements = Graph()
        for statement in proxy.triples(qualified=self.qualified):
            statements.add(statement)

        try:
            serialized = statements.serialize(format="nt11").strip()
            self.fh.write(serialized + "\n")
        except Exception:
            log.exception("Failed to serialize ntriples.")
@@ -0,0 +1,308 @@
1
+ """
2
+ Converting FtM data to a property graph data model.
3
+
4
+ This module provides an abstract data object that represents a property
5
+ graph. This is used by the exporter modules to convert data
6
+ to a specific output format, like Cypher or NetworkX.
7
+ """
8
+ import logging
9
+ from typing import Any, Dict, Generator, Iterable, List, Optional
10
+
11
+ from followthemoney.types import registry
12
+ from followthemoney.types.common import PropertyType
13
+ from followthemoney.schema import Schema
14
+ from followthemoney.proxy import EntityProxy
15
+ from followthemoney.property import Property
16
+ from followthemoney.exc import InvalidModel
17
+
18
+ log = logging.getLogger(__name__)
19
+
20
+
21
class Node(object):
    """A node represents either an entity that can be rendered as a
    node in a graph, or as a re-ified value, like a name, email
    address or phone number."""

    __slots__ = ["type", "value", "id", "proxy", "schema"]

    def __init__(
        self,
        type_: PropertyType,
        value: str,
        proxy: Optional[EntityProxy] = None,
        schema: Optional[Schema] = None,
    ) -> None:
        """Create a node for ``value`` of the given property type.

        :param type_: the property type of the value.
        :param value: the raw value (an entity ID for entity nodes).
        :param proxy: the entity backing this node, if it is known.
        :param schema: schema to record when no ``proxy`` is given.
        """
        self.type = type_
        self.value = value
        # NOTE: the ID is whatever ``node_id_safe`` returns for the value; it
        # is deliberately not validated here, so users of the node have to
        # cope with a missing (``None``) ID themselves.
        self.id = type_.node_id_safe(value)
        self.proxy = proxy
        # The schema of a given proxy wins over the explicit argument.
        self.schema = schema if proxy is None else proxy.schema

    @property
    def is_entity(self) -> bool:
        """Check to see if the node represents an entity. If this is false, the
        node represents a non-entity property value that has been reified, like
        a phone number or a name."""
        return self.type == registry.entity

    @property
    def caption(self) -> str:
        """A user-facing label for the current node."""
        if self.type == registry.entity and self.proxy is not None:
            return self.proxy.caption
        caption = self.type.caption(self.value)
        # Fall back to the raw value if the type offers no caption.
        return caption or self.value

    def to_dict(self) -> Dict[str, Any]:
        """Return a simple dictionary to reflect this graph node."""
        return {
            "id": self.id,
            "type": self.type.name,
            "value": self.value,
            "caption": self.caption,
        }

    @classmethod
    def from_proxy(cls, proxy: EntityProxy) -> "Node":
        """For a given :class:`~followthemoney.proxy.EntityProxy`, return a node
        based on the entity."""
        return cls(registry.entity, proxy.id, proxy=proxy)

    def __str__(self) -> str:
        return self.caption

    def __repr__(self) -> str:
        return "<Node(%r, %r, %r)>" % (self.id, self.type, self.caption)

    def __hash__(self) -> int:
        return hash(self.id)

    def __eq__(self, other: Any) -> bool:
        """Two nodes are equal when they share the same ID.

        Anything that is not a node (``None``, strings, ...) is reported as
        not comparable instead of raising ``AttributeError``; Python then
        falls back to identity, so such comparisons evaluate to ``False``.
        """
        if not isinstance(other, Node):
            return NotImplemented
        return bool(self.id == other.id)
85
+
86
+
87
class Edge(object):
    """A link between two nodes."""

    __slots__ = [
        "id",
        "weight",
        "source_id",
        "target_id",
        "prop",
        "proxy",
        "schema",
        "graph",
    ]

    def __init__(
        self,
        graph: "Graph",
        source: Node,
        target: Node,
        proxy: Optional[EntityProxy] = None,
        prop: Optional[Property] = None,
        value: Optional[str] = None,
    ):
        """Create an edge from ``source`` to ``target`` inside ``graph``.

        :param graph: the graph used to resolve the end points later on.
        :param source: node the edge starts at.
        :param target: node the edge points to.
        :param proxy: entity that this edge was derived from, if any.
        :param prop: property that this edge was derived from, if any.
        :param value: property value; together with ``prop`` it determines
            the edge weight.
        """
        self.graph = graph
        self.id = f"{source.id}<>{target.id}"
        self.source_id = source.id
        self.target_id = target.id
        self.weight = 1.0
        self.prop = prop
        self.proxy = proxy
        self.schema: Optional[Schema] = None
        if prop is not None and value is not None:
            self.weight = prop.specificity(value)
        if proxy is not None:
            # Embed the entity ID so that several edge entities between the
            # same pair of nodes do not collapse into one edge ID.
            self.id = f"{source.id}<{proxy.id}>{target.id}"
            self.schema = proxy.schema

    @property
    def source(self) -> Optional[Node]:
        """The graph node from which the edge originates."""
        if self.source_id is None:
            return None
        return self.graph.nodes.get(self.source_id)

    @property
    def source_prop(self) -> Property:
        """Get the entity property originating this edge.

        :raises InvalidModel: if neither the schema nor ``prop`` yields one.
        """
        if self.schema is not None and self.schema.source_prop is not None:
            if self.schema.source_prop.reverse is not None:
                return self.schema.source_prop.reverse
        if self.prop is None:
            raise InvalidModel("Contradiction: %r" % self)
        return self.prop

    @property
    def target(self) -> Optional[Node]:
        """The graph node to which the edge points."""
        if self.target_id is None:
            return None
        return self.graph.nodes.get(self.target_id)

    @property
    def target_prop(self) -> Optional[Property]:
        """Get the entity property at the target end of this edge, or
        ``None`` if neither the schema nor ``prop`` provides one."""
        if self.schema is not None and self.schema.target_prop is not None:
            return self.schema.target_prop.reverse
        if self.prop is not None:
            return self.prop.reverse
        # NOTE: this edge points at a value node.
        return None

    @property
    def type_name(self) -> str:
        """Return a machine-readable description of the type of the edge.
        This is either a property name or a schema name.

        :raises InvalidModel: if the edge has neither a schema nor a property.
        """
        if self.schema is not None:
            return self.schema.name
        if self.prop is None:
            raise InvalidModel("Invalid edge: %r" % self)
        return self.prop.name

    def to_dict(self) -> Dict[str, Optional[str]]:
        """Return a simple dictionary to reflect this graph edge."""
        return {
            "id": self.id,
            "source_id": self.source_id,
            "target_id": self.target_id,
            "type_name": self.type_name,
        }

    def __repr__(self) -> str:
        return "<Edge(%r)>" % self.id

    def __hash__(self) -> int:
        return hash(self.id)

    def __eq__(self, other: Any) -> bool:
        """Two edges are equal when they share the same ID.

        Anything that is not an edge (``None``, strings, ...) is reported as
        not comparable instead of raising ``AttributeError``; Python then
        falls back to identity, so such comparisons evaluate to ``False``.
        """
        if not isinstance(other, Edge):
            return NotImplemented
        return bool(self.id == other.id)
184
+
185
+
186
class Graph(object):
    """A set of nodes and edges, derived from entities and their properties.
    This represents an alternative interpretation of FtM data as a property
    graph.

    This class is meant to be extensible in order to support additional
    backends, like Aleph.
    """

    def __init__(self, edge_types: Iterable[PropertyType] = registry.pivots) -> None:
        """Create an empty graph.

        :param edge_types: property types whose values get reified into
            value nodes; only the ``matchable`` ones are retained.
        """
        types = registry.get_types(edge_types)
        self.edge_types = [t for t in types if t.matchable]
        self.flush()

    def flush(self) -> None:
        """Remove all nodes, edges and proxies from the graph."""
        self.edges: Dict[str, Edge] = {}
        self.nodes: Dict[str, Node] = {}
        self.proxies: Dict[str, Optional[EntityProxy]] = {}

    def queue(self, id_: str, proxy: Optional[EntityProxy] = None) -> None:
        """Register a reference to an entity in the graph."""
        # A known entry is only overwritten when an actual proxy is supplied,
        # so a bare reference never discards an already loaded entity.
        if id_ not in self.proxies or proxy is not None:
            self.proxies[id_] = proxy

    @property
    def queued(self) -> List[str]:
        """Return a list of all the entities which are referenced from the graph
        but that haven't been loaded yet. This can be used to get a list of
        entities that should be included to expand the whole graph by one degree.
        """
        return [i for (i, p) in self.proxies.items() if p is None]

    def _get_node_stub(self, prop: Property, value: str) -> Node:
        """Return the node for ``value``, registering it if it is new.

        Entity-typed values are queued as references as well. A node without
        an ID is returned as-is and never stored in the graph.
        """
        if prop.type == registry.entity:
            self.queue(value)
        node = Node(prop.type, value, schema=prop.range)
        if node.id is None:
            return node
        if node.id not in self.nodes:
            self.nodes[node.id] = node
        return self.nodes[node.id]

    def _add_edge(self, proxy: EntityProxy, source: str, target: str) -> None:
        """Add an edge derived from an edge-type entity.

        :raises InvalidModel: if the proxy schema lacks a source or target
            property.
        """
        if proxy.schema.source_prop is None:
            raise InvalidModel("Invalid edge entity: %r" % proxy)
        source_node = self._get_node_stub(proxy.schema.source_prop, source)
        if proxy.schema.target_prop is None:
            raise InvalidModel("Invalid edge entity: %r" % proxy)
        target_node = self._get_node_stub(proxy.schema.target_prop, target)
        if source_node.id is not None and target_node.id is not None:
            edge = Edge(self, source_node, target_node, proxy=proxy)
            self.edges[edge.id] = edge

    def _add_node(self, proxy: EntityProxy) -> None:
        """Derive a node and its value edges from the given proxy."""
        entity = Node.from_proxy(proxy)
        if entity.id is not None:
            self.nodes[entity.id] = entity
        for prop, value in proxy.itervalues():
            if prop.type not in self.edge_types:
                continue
            node = self._get_node_stub(prop, value)
            if node.id is None:
                continue
            edge = Edge(self, entity, node, prop=prop, value=value)
            # Edges without a positive weight are left out of the graph.
            if edge.weight > 0:
                self.edges[edge.id] = edge

    def add(self, proxy: EntityProxy) -> None:
        """Add an :class:`~followthemoney.proxy.EntityProxy` to the graph and make
        it either a :class:`~followthemoney.graph.Node` or an
        :class:`~followthemoney.graph.Edge`."""
        if proxy is None:
            return
        self.queue(proxy.id, proxy)
        if proxy.schema.edge:
            for (source, target) in proxy.edgepairs():
                self._add_edge(proxy, source, target)
        else:
            self._add_node(proxy)

    def iternodes(self) -> Iterable[Node]:
        """Iterate all :class:`nodes <followthemoney.graph.Node>` in the graph."""
        return self.nodes.values()

    def iteredges(self) -> Iterable[Edge]:
        """Iterate all :class:`edges <followthemoney.graph.Edge>` in the graph."""
        return self.edges.values()

    def get_outbound(
        self, node: Node, prop: Optional[Property] = None
    ) -> Generator[Edge, None, None]:
        """Get all edges pointed out from the given node."""
        for edge in self.iteredges():
            source = edge.source
            # An edge whose source cannot be resolved to a node never matches;
            # comparing ``None`` against a node must not be attempted.
            if source is None or source != node:
                continue
            if prop and edge.source_prop != prop:
                continue
            yield edge

    def get_inbound(
        self, node: Node, prop: Optional[Property] = None
    ) -> Generator[Edge, None, None]:
        """Get all edges pointed at the given node."""
        for edge in self.iteredges():
            target = edge.target
            # See get_outbound(): unresolved end points never match.
            if target is None or target != node:
                continue
            if prop and edge.target_prop != prop:
                continue
            yield edge

    def get_adjacent(
        self, node: Node, prop: Optional[Property] = None
    ) -> Generator[Edge, None, None]:
        "Get all adjacent edges of the given node."
        yield from self.get_outbound(node, prop=prop)
        yield from self.get_inbound(node, prop=prop)

    def to_dict(self) -> Dict[str, Any]:
        """Return a dictionary with the graph nodes and edges."""
        return {
            "nodes": [n.to_dict() for n in self.iternodes()],
            "edges": [e.to_dict() for e in self.iteredges()],
        }
@@ -0,0 +1,212 @@
1
+ # This module violates the boundary between the role of code and
2
+ # YAML in the rest of followthemoney. It handles normalisations
3
+ # which would be much harder to express in abstract, especially
4
+ # those that simplify the data based on their pragmatics.
5
+ #
6
+ # If anyone were to swap out the default model, this would
7
+ # probably be the first place to break.
8
+ from os.path import splitext
9
+ from typing import Iterable, List, Optional, Set
10
+ from normality import safe_filename
11
+ from mimetypes import guess_extension
12
+ from itertools import product
13
+ from datetime import datetime, timedelta
14
+
15
+ from followthemoney.types import registry
16
+ from followthemoney.proxy import E
17
+ from followthemoney.util import join_text
18
+
19
+ PROV_MIN_DATES = ("createdAt", "authoredAt", "publishedAt")
20
+ PROV_MAX_DATES = ("modifiedAt", "retrievedAt")
21
+
22
+
23
def remove_checksums(proxy: E) -> E:
    """When accepting entities via a web API, it would constitute
    a security risk to allow a user to submit checksum-type properties.
    These can be traded in for access to said files if they exist in the
    underlying content-addressed storage. It seems safest to just remove
    all checksums from entities when they are untrusted user input.

    The proxy is modified in place and returned for convenience."""
    # Take a snapshot of the properties first: popping changes the proxy, and
    # it must not be mutated while its own properties are being iterated.
    for prop in list(proxy.iterprops()):
        if prop.type == registry.checksum:
            proxy.pop(prop)
    return proxy
33
+
34
+
35
def simplify_provenance(proxy: E) -> E:
    """Reduce each provenance date property to one value.

    Properties listed in ``PROV_MAX_DATES`` keep their greatest value and
    those in ``PROV_MIN_DATES`` their smallest one. Properties without any
    value stay unset. The proxy is changed in place and returned.
    """
    for choose, prop_names in ((max, PROV_MAX_DATES), (min, PROV_MIN_DATES)):
        for prop_name in prop_names:
            candidates = proxy.pop(prop_name, quiet=True)
            if len(candidates):
                proxy.set(prop_name, choose(candidates), cleaned=True)
    return proxy
47
+
48
+
49
def entity_filename(
    proxy: E, base_name: Optional[str] = None, extension: Optional[str] = None
) -> Optional[str]:
    """Build a safe file name for the given entity.

    Explicitly passed ``base_name`` and ``extension`` are never replaced.
    For ``Document`` entities, missing parts are filled in from the
    ``extension``, ``fileName`` and ``mimeType`` properties, in that order.
    The entity ID is the base name of last resort.
    """
    if proxy.schema.is_a("Document"):
        for declared in proxy.get("extension", quiet=True):
            if extension is None:
                extension = declared
            else:
                break

        for file_name in proxy.get("fileName", quiet=True):
            stem, suffix = splitext(file_name)
            if base_name is None and stem:
                base_name = stem
            if extension is None and suffix:
                extension = suffix

        for mime_type in proxy.get("mimeType", quiet=True):
            if extension is None:
                # May itself yield None, in which case the next MIME type
                # gets its chance.
                extension = guess_extension(mime_type)
            else:
                break

    if not base_name:
        base_name = proxy.id
    return safe_filename(base_name, extension=extension)
70
+
71
+
72
def name_entity(entity: E) -> E:
    """Keep a single ``name`` on the entity and demote the rest to aliases.

    Only entities matching the ``Thing`` schema that carry more than one
    name are touched. The name chosen by ``registry.name.pick`` becomes the
    sole ``name``; the remaining ones are added as ``alias`` values. The
    entity is modified in place and returned.
    """
    if not entity.schema.is_a("Thing"):
        return entity

    names = entity.get("name")
    if len(names) <= 1:
        return entity

    primary = registry.name.pick(names)
    if primary in names:
        names.remove(primary)
    entity.set("name", primary)
    entity.add("alias", names)
    return entity
85
+
86
+
87
def check_person_cutoff(
    entity: E,
    death_cutoff: datetime = datetime(2000, 1, 1),
    birth_cutoff: Optional[datetime] = None,
) -> bool:
    """Tell whether a person falls before the given cut-off dates.

    Returns ``True`` if the latest ``deathDate`` sorts before
    ``death_cutoff``, or if the earliest ``birthDate`` sorts before
    ``birth_cutoff`` (by default ``death_cutoff`` minus 100 * 365 days).
    Dates are compared as strings against the ISO form of the cut-offs.
    Entities that are not a ``Person`` always yield ``False``.
    """
    if not entity.schema.is_a("Person"):
        return False

    deaths = entity.get("deathDate", quiet=True)
    death_limit = death_cutoff.isoformat()
    if deaths and max(deaths) < death_limit:
        return True

    births = entity.get("birthDate", quiet=True)
    if birth_cutoff is None:
        birth_cutoff = death_cutoff - timedelta(days=100 * 365)
    birth_limit = birth_cutoff.isoformat()
    return bool(births and min(births) < birth_limit)
107
+
108
+
109
def remove_prefix_dates(entity: E) -> E:
    """Drop date values that are mere prefixes of more precise ones.

    Applies to every date-typed property that is set on the entity: given
    both ``1990`` and ``1990-05-01``, only the latter survives. The entity
    is modified in place and returned.
    """
    for prop in entity.iterprops():
        if prop.type != registry.date:
            continue
        entity.set(prop, remove_prefix_date_values(entity.get(prop)))
    return entity
119
+
120
+
121
def remove_prefix_date_values(values: Iterable[str]) -> List[str]:
    """Return ``values`` without entries that are a prefix of a longer one.

    The result is ordered from longest to shortest value; exact duplicates
    are collapsed as well. See ``remove_prefix_dates``.
    """
    ordered = sorted(values, key=len, reverse=True)
    kept: List[str] = []
    for position, candidate in enumerate(ordered):
        # Everything ahead of the candidate is at least as long as it is.
        covered = any(
            longer.startswith(candidate) for longer in ordered[:position]
        )
        if not covered:
            kept.append(candidate)
    return kept
134
+
135
+
136
def inline_names(entity: E, related: E) -> None:
    """Copy the names of ``related`` into ``entity.namesMentioned``.

    Work-around for a UI problem: in a list of payments between a sender and
    a beneficiary, searching for a term from one of the party names finds
    nothing, because the names are only indexed with the parties and not with
    the payment. Questionable in theory, useful in practice.

    Nothing happens if the schema of ``entity`` has no ``namesMentioned``
    property.
    """
    prop = entity.schema.get("namesMentioned")
    if prop is None:
        return
    entity.add(prop, related.get_type_values(registry.name))
148
+
149
+
150
def combine_names(entity: E) -> E:
    """Assemble full names from the name parts of a ``Person`` entity.

    Every combination of first, second, middle, father and last name is
    joined and added as an ``alias``; second, middle and father names are
    optional in each combination. This cannot be culturally correct for the
    whole planet, so it is best kept to internal-facing uses like matching.
    The entity is modified in place and returned.
    """
    if not entity.schema.is_a("Person"):
        return entity

    first_names = entity.get("firstName")
    optional_parts = []
    for part_name in ("secondName", "middleName", "fatherName"):
        options = entity.get(part_name)
        # The empty option yields the combinations that skip this part.
        options.append("")
        optional_parts.append(options)
    last_names = entity.get("lastName")

    for parts in product(first_names, *optional_parts, last_names):
        name = join_text(*parts)
        if name is not None:
            entity.add("alias", name)

    # If the entity still has no name-type value, at least add the last name:
    if not entity.get_type_values(registry.name) and len(last_names):
        entity.add("alias", last_names)
    return entity
175
+
176
+
177
def dates_years(dates: Iterable[Optional[str]]) -> Set[str]:
    """Collect the distinct year prefixes (first four characters) of the
    given date strings, skipping ``None`` entries."""
    return {date[:4] for date in dates if date is not None}
184
+
185
+
186
def post_summary(
    organization: str,
    role: Optional[str],
    start_dates: Iterable[Optional[str]],
    end_dates: Iterable[Optional[str]],
    dates: Iterable[Optional[str]],
) -> str:
    """Render a one-line summary for a Post object.

    The result is the organization, followed in brackets by the role and/or
    a date range when available. The range is ``<start>-<end>`` built from
    the smallest start year and the smallest end year; if neither exists,
    the sorted years of ``dates`` are listed instead.
    """
    start = min(dates_years(start_dates), default="")
    end = min(dates_years(end_dates), default="")
    date_range = f"{start}-{end}" if (len(start) or len(end)) else None

    years = dates_years(dates)
    if date_range is None and len(years):
        date_range = ", ".join(sorted(years))

    if date_range and role:
        bracketed = f"{role}, {date_range}"
    else:
        bracketed = role or date_range

    if not bracketed:
        return organization
    return f"{organization} ({bracketed})"
@@ -1,3 +1,3 @@
1
1
  from followthemoney.mapping.query import QueryMapping
2
2
 
3
- __all__ = [QueryMapping]
3
+ __all__ = ["QueryMapping"]
@@ -1,65 +1,97 @@
1
1
  import io
2
2
  import os
3
3
  import logging
4
+ from banal.lists import ensure_list
4
5
  import requests
5
6
  from csv import DictReader
6
- from banal import ensure_list
7
- from normality import stringify
7
+ from urllib.parse import urlparse
8
+ from banal import keys_values
9
+ from typing import (
10
+ TYPE_CHECKING,
11
+ Any,
12
+ Dict,
13
+ Generator,
14
+ ItemsView,
15
+ Iterable,
16
+ List,
17
+ Optional,
18
+ Set,
19
+ Tuple,
20
+ cast,
21
+ )
8
22
 
9
- from followthemoney.mapping.source import Source
23
+ from followthemoney.mapping.source import Record, Source
24
+ from followthemoney.util import sanitize_text
10
25
  from followthemoney.exc import InvalidMapping
11
26
 
27
+ if TYPE_CHECKING:
28
+ from followthemoney.mapping.query import QueryMapping
29
+
12
30
  log = logging.getLogger(__name__)
31
+ FilterList = List[Tuple[str, Set[Optional[str]]]]
13
32
 
14
33
 
15
34
  class CSVSource(Source):
16
35
  """Special case for entity loading directly from a CSV URL"""
17
36
 
18
- def __init__(self, query, data):
19
- super(CSVSource, self).__init__(query, data)
20
- urls = ensure_list(data.get('csv_url'))
21
- urls.extend(ensure_list(data.get('csv_urls')))
22
- self.urls = set()
23
- for url in urls:
24
- self.urls.add(os.path.expandvars(url))
37
+ def __init__(self, query: "QueryMapping", data: Dict[str, Any]) -> None:
38
+ super().__init__(query, data)
39
+ self.urls: Set[str] = set()
40
+ for url in keys_values(data, "csv_url", "csv_urls"):
41
+ self.urls.add(cast(str, os.path.expandvars(url)))
25
42
 
26
43
  if not len(self.urls):
27
44
  raise InvalidMapping("No CSV URLs are specified.")
28
45
 
29
- def read_csv(self, url):
30
- parsed_url = requests.utils.urlparse(url)
46
+ self.filters_set = self._parse_filters(self.filters)
47
+ self.filters_not_set = self._parse_filters(self.filters_not)
48
+
49
+ def _parse_filters(self, filters: ItemsView[str, Any]) -> FilterList:
50
+ filters_set: FilterList = []
51
+ for (key, value) in filters:
52
+ values = set(cast(List[Optional[str]], ensure_list(value)))
53
+ filters_set.append((key, values))
54
+ return filters_set
55
+
56
+ def check_filters(self, data: Record) -> bool:
57
+ for (k, v) in self.filters_set:
58
+ if data.get(k) not in v:
59
+ return False
60
+ for (k, v) in self.filters_not_set:
61
+ if data.get(k) in v:
62
+ return False
63
+ return True
64
+
65
+ @classmethod
66
+ def read_csv(cls, fh: Iterable[str]) -> Generator[Record, None, None]:
67
+ for row in DictReader(fh, skipinitialspace=True):
68
+ data: Record = {}
69
+ for ref, ref_value in row.items():
70
+ value = sanitize_text(ref_value)
71
+ if value is not None:
72
+ data[ref] = value
73
+ yield data
74
+
75
+ def read_csv_url(self, url: str) -> Generator[Record, None, None]:
76
+ parsed_url = urlparse(url)
31
77
  log.info("Loading: %s", url)
32
- if parsed_url.scheme in ['http', 'https']:
78
+ if parsed_url.scheme in ["http", "https"]:
33
79
  res = requests.get(url, stream=True)
34
80
  if not res.ok:
35
81
  raise InvalidMapping("Failed to open CSV: %s" % url)
36
82
  # if res.encoding is None:
37
- res.encoding = 'utf-8'
83
+ res.encoding = "utf-8"
38
84
  # log.info("Detected encoding: %s", res.encoding)
39
85
  lines = res.iter_lines(decode_unicode=True)
40
- for row in DictReader(lines, skipinitialspace=True):
41
- yield row
86
+ yield from self.read_csv(lines)
42
87
  else:
43
- with io.open(parsed_url.path, 'r') as fh:
44
- for row in DictReader(fh, skipinitialspace=True):
45
- yield row
46
-
47
- def check_filters(self, data):
48
- for (k, v) in self.filters:
49
- if v != data.get(k):
50
- return False
51
- for (k, v) in self.filters_not:
52
- if v == data.get(k):
53
- return False
54
- return True
88
+ with io.open(parsed_url.path, "r") as fh:
89
+ yield from self.read_csv(fh)
55
90
 
56
91
  @property
57
- def records(self):
92
+ def records(self) -> Generator[Record, None, None]:
58
93
  """Iterate through the table applying filters on-the-go."""
59
94
  for url in self.urls:
60
- for row in self.read_csv(url):
61
- data = {}
62
- for ref in self.query.refs:
63
- data[ref] = stringify(row.get(ref))
64
- if self.check_filters(data):
65
- yield data
95
+ for record in self.read_csv_url(url):
96
+ if self.check_filters(record):
97
+ yield record