followthemoney 1.3.6__py3-none-any.whl → 3.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +5 -3
- followthemoney/cli/__init__.py +17 -0
- followthemoney/cli/aggregate.py +56 -0
- followthemoney/cli/cli.py +88 -0
- followthemoney/cli/exports.py +121 -0
- followthemoney/cli/mapping.py +85 -0
- followthemoney/cli/sieve.py +67 -0
- followthemoney/cli/util.py +142 -0
- followthemoney/compare.py +132 -55
- followthemoney/exc.py +19 -6
- followthemoney/export/common.py +29 -0
- followthemoney/export/csv.py +82 -0
- followthemoney/export/excel.py +75 -0
- followthemoney/export/graph.py +79 -0
- followthemoney/export/neo4j.py +182 -0
- followthemoney/export/rdf.py +26 -0
- followthemoney/graph.py +308 -0
- followthemoney/helpers.py +212 -0
- followthemoney/mapping/__init__.py +1 -1
- followthemoney/mapping/csv.py +67 -35
- followthemoney/mapping/entity.py +116 -44
- followthemoney/mapping/property.py +90 -44
- followthemoney/mapping/query.py +27 -19
- followthemoney/mapping/source.py +15 -5
- followthemoney/mapping/sql.py +75 -61
- followthemoney/messages.py +13 -7
- followthemoney/model.py +108 -56
- followthemoney/namespace.py +119 -0
- followthemoney/offshore.py +48 -0
- followthemoney/ontology.py +77 -0
- followthemoney/property.py +204 -71
- followthemoney/proxy.py +455 -118
- followthemoney/rdf.py +9 -0
- followthemoney/schema/Address.yaml +78 -0
- followthemoney/schema/Airplane.yaml +17 -10
- followthemoney/schema/Analyzable.yaml +54 -0
- followthemoney/schema/Article.yaml +16 -0
- followthemoney/schema/Assessment.yaml +32 -0
- followthemoney/schema/Asset.yaml +10 -4
- followthemoney/schema/Associate.yaml +41 -0
- followthemoney/schema/Audio.yaml +24 -0
- followthemoney/schema/BankAccount.yaml +53 -9
- followthemoney/schema/Call.yaml +48 -0
- followthemoney/schema/CallForTenders.yaml +117 -0
- followthemoney/schema/Company.yaml +37 -12
- followthemoney/schema/Contract.yaml +41 -7
- followthemoney/schema/ContractAward.yaml +30 -11
- followthemoney/schema/CourtCase.yaml +16 -10
- followthemoney/schema/CourtCaseParty.yaml +17 -6
- followthemoney/schema/CryptoWallet.yaml +48 -0
- followthemoney/schema/Debt.yaml +37 -0
- followthemoney/schema/Directorship.yaml +17 -4
- followthemoney/schema/Document.yaml +72 -139
- followthemoney/schema/Documentation.yml +38 -0
- followthemoney/schema/EconomicActivity.yaml +32 -17
- followthemoney/schema/Email.yaml +76 -0
- followthemoney/schema/Employment.yaml +39 -0
- followthemoney/schema/Event.yaml +35 -3
- followthemoney/schema/Family.yaml +41 -0
- followthemoney/schema/Folder.yaml +13 -0
- followthemoney/schema/HyperText.yaml +21 -0
- followthemoney/schema/Identification.yaml +40 -0
- followthemoney/schema/Image.yaml +25 -0
- followthemoney/schema/Interest.yaml +3 -6
- followthemoney/schema/Interval.yaml +56 -5
- followthemoney/schema/LegalEntity.yaml +81 -20
- followthemoney/schema/License.yaml +7 -3
- followthemoney/schema/Membership.yaml +19 -4
- followthemoney/schema/Mention.yaml +54 -0
- followthemoney/schema/Message.yaml +73 -0
- followthemoney/schema/Note.yaml +23 -0
- followthemoney/schema/Occupancy.yaml +40 -0
- followthemoney/schema/Organization.yaml +38 -3
- followthemoney/schema/Ownership.yaml +16 -4
- followthemoney/schema/Package.yaml +17 -0
- followthemoney/schema/Page.yaml +43 -0
- followthemoney/schema/Pages.yaml +23 -0
- followthemoney/schema/Passport.yaml +15 -17
- followthemoney/schema/Payment.yaml +38 -7
- followthemoney/schema/Person.yaml +61 -5
- followthemoney/schema/PlainText.yaml +17 -0
- followthemoney/schema/Position.yaml +50 -0
- followthemoney/schema/Post.yaml +42 -0
- followthemoney/schema/Project.yaml +27 -0
- followthemoney/schema/ProjectParticipant.yaml +36 -0
- followthemoney/schema/PublicBody.yaml +14 -3
- followthemoney/schema/RealEstate.yaml +19 -3
- followthemoney/schema/Representation.yaml +17 -6
- followthemoney/schema/Sanction.yaml +44 -20
- followthemoney/schema/Security.yaml +59 -0
- followthemoney/schema/Similar.yaml +37 -0
- followthemoney/schema/Succession.yaml +36 -0
- followthemoney/schema/Table.yaml +32 -0
- followthemoney/schema/TaxRoll.yaml +27 -9
- followthemoney/schema/Thing.yaml +69 -13
- followthemoney/schema/Trip.yaml +42 -0
- followthemoney/schema/UnknownLink.yaml +17 -6
- followthemoney/schema/UserAccount.yaml +44 -0
- followthemoney/schema/Value.yaml +5 -1
- followthemoney/schema/Vehicle.yaml +25 -8
- followthemoney/schema/Vessel.yaml +18 -10
- followthemoney/schema/Video.yaml +20 -0
- followthemoney/schema/Workbook.yaml +18 -0
- followthemoney/schema.py +406 -135
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
- followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
- followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
- followthemoney/translations/fr/followthemoney.po +3861 -0
- followthemoney/translations/messages.pot +3021 -725
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
- followthemoney/translations/ru/followthemoney.po +4221 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
- followthemoney/types/__init__.py +35 -17
- followthemoney/types/address.py +41 -21
- followthemoney/types/checksum.py +25 -0
- followthemoney/types/common.py +233 -88
- followthemoney/types/country.py +89 -56
- followthemoney/types/date.py +59 -76
- followthemoney/types/email.py +66 -35
- followthemoney/types/entity.py +66 -13
- followthemoney/types/gender.py +66 -0
- followthemoney/types/iban.py +47 -28
- followthemoney/types/identifier.py +49 -22
- followthemoney/types/ip.py +35 -21
- followthemoney/types/json.py +58 -0
- followthemoney/types/language.py +124 -37
- followthemoney/types/mimetype.py +44 -0
- followthemoney/types/name.py +56 -12
- followthemoney/types/number.py +30 -0
- followthemoney/types/phone.py +92 -34
- followthemoney/types/registry.py +52 -0
- followthemoney/types/string.py +43 -0
- followthemoney/types/topic.py +94 -0
- followthemoney/types/url.py +39 -17
- followthemoney/util.py +139 -45
- followthemoney-3.8.0.dist-info/METADATA +153 -0
- followthemoney-3.8.0.dist-info/RECORD +157 -0
- {followthemoney-1.3.6.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
- followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
- followthemoney-1.3.6.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
- followthemoney/link.py +0 -75
- followthemoney/schema/Associate.yml +0 -19
- followthemoney/schema/Family.yml +0 -19
- followthemoney/schema/Land.yml +0 -9
- followthemoney/schema/Relationship.yaml +0 -26
- followthemoney/types/domain.py +0 -50
- followthemoney-1.3.6.dist-info/DESCRIPTION.rst +0 -3
- followthemoney-1.3.6.dist-info/METADATA +0 -39
- followthemoney-1.3.6.dist-info/RECORD +0 -108
- followthemoney-1.3.6.dist-info/entry_points.txt +0 -3
- followthemoney-1.3.6.dist-info/metadata.json +0 -1
- followthemoney-1.3.6.dist-info/namespace_packages.txt +0 -1
- followthemoney-1.3.6.dist-info/top_level.txt +0 -3
- ns/ontology.py +0 -128
- tests/types/test_addresses.py +0 -24
- tests/types/test_common.py +0 -27
- tests/types/test_countries.py +0 -21
- tests/types/test_dates.py +0 -72
- tests/types/test_domains.py +0 -23
- tests/types/test_emails.py +0 -30
- tests/types/test_entity.py +0 -16
- tests/types/test_iban.py +0 -109
- tests/types/test_identifiers.py +0 -25
- tests/types/test_ip.py +0 -26
- tests/types/test_languages.py +0 -20
- tests/types/test_names.py +0 -33
- tests/types/test_phones.py +0 -24
- tests/types/test_registry.py +0 -14
- tests/types/test_urls.py +0 -23
- {ns → followthemoney/export}/__init__.py +0 -0
- /tests/types/__init__.py → /followthemoney/py.typed +0 -0
followthemoney/compare.py
CHANGED
|
@@ -1,64 +1,141 @@
|
|
|
1
|
+
import math
|
|
1
2
|
import itertools
|
|
2
|
-
from
|
|
3
|
+
from typing import Dict, Generator, Iterable, List, Optional
|
|
4
|
+
import fingerprints
|
|
3
5
|
from normality import normalize
|
|
4
|
-
from followthemoney.types import registry
|
|
5
|
-
from followthemoney.util import dampen
|
|
6
6
|
from followthemoney.exc import InvalidData
|
|
7
|
+
from followthemoney.model import Model
|
|
8
|
+
from followthemoney.types import registry
|
|
9
|
+
from followthemoney.proxy import EntityProxy
|
|
10
|
+
from followthemoney.types.common import PropertyType
|
|
7
11
|
|
|
8
|
-
|
|
9
|
-
#
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
registry.
|
|
15
|
-
registry.
|
|
16
|
-
registry.identifier:
|
|
17
|
-
registry.
|
|
18
|
-
registry.
|
|
19
|
-
registry.
|
|
20
|
-
registry.iban: 0.
|
|
21
|
-
registry.
|
|
22
|
-
|
|
23
|
-
registry.phone: 0.1,
|
|
24
|
-
registry.country: 0.1,
|
|
25
|
-
registry.language: 0.1,
|
|
12
|
+
|
|
13
|
+
# Compare weights come from the glm-bernoulli model in followthemoney-predict
|
|
14
|
+
Weights = Dict[Optional[PropertyType], float]
|
|
15
|
+
Scores = Dict[PropertyType, Optional[float]]
|
|
16
|
+
COMPARE_WEIGHTS: Weights = {
|
|
17
|
+
registry.name: 12.275729155073371,
|
|
18
|
+
registry.country: 1.0494517476987815,
|
|
19
|
+
registry.date: 6.960245940274218,
|
|
20
|
+
registry.identifier: 5.2209896558064175,
|
|
21
|
+
registry.address: 6.456137299747168,
|
|
22
|
+
registry.phone: 3.538892687331418,
|
|
23
|
+
registry.email: 14.115925628770384,
|
|
24
|
+
registry.iban: 0.019140301711998726,
|
|
25
|
+
registry.url: 3.211995327345834,
|
|
26
|
+
None: -11.91521189545115,
|
|
26
27
|
}
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
def
|
|
30
|
-
"""Compare two entities and return
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
continue
|
|
30
|
+
def compare_scores(model: Model, left: EntityProxy, right: EntityProxy) -> Scores:
    """Score how well two entities match, one score per property type group.

    An empty mapping is returned when ``model.common_schema`` rejects the two
    schemata with ``InvalidData``. Type groups present on both entities get
    the result of the matching comparison function; groups present on only
    one of the entities are recorded as ``None``.
    """
    try:
        model.common_schema(left.schema, right.schema)
    except InvalidData:
        return {}

    left_values = left.get_type_inverted(matchable=True)
    right_values = right.get_type_inverted(matchable=True)
    left_names = set(left_values.keys())
    right_names = set(right_values.keys())
    result: Scores = {}

    for name in left_names.intersection(right_names):
        prop_type = registry.groups[name]
        try:
            if prop_type == registry.name:
                result[prop_type] = compare_names(left, right)
            elif prop_type == registry.country:
                result[prop_type] = compare_countries(left, right)
            else:
                result[prop_type] = compare_group(
                    prop_type, left_values[name], right_values[name]
                )
        except ValueError:
            # The comparison helper refused the input; leave the group unscored.
            continue

    for name in left_names.symmetric_difference(right_names):
        result[registry.groups[name]] = None
    return result
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _compare(scores: Scores, weights: Weights, n_std: int = 1) -> float:
    """Combine per-type scores into a single match probability.

    The scores are combined as a weighted sum (the ``None`` key in ``weights``
    is added as a constant bias term) and squashed through the logistic
    function.

    Args:
        scores: Per-type scores; missing or ``None`` entries count as 0.0.
        weights: Weight per type, plus an optional bias under the ``None`` key.
        n_std: Unused; kept so existing callers keep working.

    Returns:
        A value between 0.0 and 1.0. 0.0 is returned straight away when there
        are no scores or every score is falsy.
    """
    if not scores or not any(scores.values()):
        return 0.0
    logit = 0.0
    for field, weight in weights.items():
        if field is None:
            # The None key carries the bias (intercept), not a scored field.
            logit += weight
        else:
            logit += weight * (scores.get(field) or 0.0)
    try:
        return 1.0 / (1.0 + math.exp(-logit))
    except OverflowError:
        # exp() overflows only for a hugely negative logit, whose logistic
        # value is indistinguishable from zero.
        return 0.0
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def compare(
    model: Model,
    left: EntityProxy,
    right: EntityProxy,
    weights: Weights = COMPARE_WEIGHTS,
) -> float:
    """Return one overall match score for two entities.

    The per-type scores from ``compare_scores`` are combined by ``_compare``
    using ``weights``.
    """
    return _compare(compare_scores(model, left, right), weights)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _normalize_names(names: Iterable[str]) -> Generator[str, None, None]:
    """Yield the distinct comparable forms of each given name.

    For every name this emits its ASCII-normalised form and, when it is longer
    than six characters, its ``fingerprints.generate`` fingerprint. A value is
    emitted at most once across the whole sequence.
    """
    emitted = set()
    for raw in names:
        ascii_form = normalize(raw, ascii=True)
        if not (ascii_form is None or ascii_form in emitted):
            emitted.add(ascii_form)
            yield ascii_form
        fingerprint = fingerprints.generate(raw)
        if fingerprint is None or len(fingerprint) <= 6 or fingerprint in emitted:
            continue
        emitted.add(fingerprint)
        yield fingerprint
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def compare_group(
    group_type: PropertyType, left_values: List[str], right_values: List[str]
) -> Optional[float]:
    """Compare two sets of values that belong to the same property type.

    Args:
        group_type: The type whose ``compare_sets`` performs the comparison.
        left_values: Values of that type on the left entity.
        right_values: Values of that type on the right entity.

    Returns:
        ``None`` when exactly one side is empty, otherwise the result of
        ``group_type.compare_sets``.

    Raises:
        ValueError: When both sides are empty.
    """
    if not left_values and not right_values:
        # Interpolate here: exceptions do not format logging-style arguments.
        raise ValueError(
            "At least one proxy must have property type: %s" % (group_type,)
        )
    if not left_values or not right_values:
        return None
    return group_type.compare_sets(left_values, right_values)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def compare_names(
    left: EntityProxy, right: EntityProxy, max_names: int = 200
) -> Optional[float]:
    """Return the best pairwise name similarity between two entities.

    At most ``max_names`` normalised names are taken from each side. The best
    similarity is then scaled down by a factor that shrinks as the number of
    compared name pairs grows.

    Returns ``None`` when exactly one side has no usable names and raises
    ``ValueError`` when neither side has any.
    """
    left_list = list(itertools.islice(_normalize_names(left.names), max_names))
    right_list = list(itertools.islice(_normalize_names(right.names), max_names))
    if not (left_list or right_list):
        raise ValueError("At least one proxy must have name properties")
    if not (left_list and right_list):
        return None

    best = 0.0
    for pair in itertools.product(left_list, right_list):
        best = max(best, registry.name.compare(pair[0], pair[1]))
        if best == 1.0:
            # A perfect match cannot be improved on.
            break

    pair_count = len(left_list) * len(right_list)
    dampening = 2 ** (-pair_count / (max_names * max_names))
    best *= min(1.0, dampening)
    return best
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def compare_countries(left: EntityProxy, right: EntityProxy) -> Optional[float]:
    """Return the overlap ratio of the two entities' country hints.

    The ratio is the size of the intersection divided by the size of the
    union. Returns ``None`` when exactly one side has no country hints and
    raises ``ValueError`` when neither side has any.
    """
    ours = left.country_hints
    theirs = right.country_hints
    if not (ours or theirs):
        raise ValueError("At least one proxy must have country properties")
    if not (ours and theirs):
        return None
    shared = ours.intersection(theirs)
    combined = ours.union(theirs)
    return len(shared) / float(len(combined))
|
followthemoney/exc.py
CHANGED
|
@@ -1,18 +1,31 @@
|
|
|
1
|
+
from typing import Dict, Optional, TypedDict
|
|
1
2
|
|
|
2
3
|
|
|
3
|
-
class
|
|
4
|
+
class ErrorSpec(TypedDict, total=False):
    """Structured error details carried by ``InvalidData``; every key is optional."""

    # NOTE(review): presumably maps a property name to its error message;
    # confirm against the code that raises InvalidData.
    properties: Dict[str, str]
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FollowTheMoneyException(Exception):
    """Common base class for every error raised by this library."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class InvalidData(FollowTheMoneyException):
    """Schema validation errors will be caught by the API.

    The human-readable message is passed to the base exception; optional
    structured details are kept on ``errors``.
    """

    def __init__(self, message: str, errors: Optional[ErrorSpec] = None) -> None:
        super().__init__(message)
        # Fall back to an empty spec so callers can always read ``errors``.
        self.errors: ErrorSpec = errors if errors else {}
|
|
9
20
|
|
|
10
21
|
|
|
11
|
-
class InvalidModel(
|
|
22
|
+
class InvalidModel(FollowTheMoneyException):
    """Raised when the schema model is not defined correctly."""
|
|
14
26
|
|
|
15
27
|
|
|
16
|
-
class InvalidMapping(
|
|
28
|
+
class InvalidMapping(FollowTheMoneyException):
    """Raised when a data mapping is invalid."""
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from typing import Generator, List, Optional, Tuple
|
|
2
|
+
from followthemoney.property import Property
|
|
3
|
+
from followthemoney.proxy import E
|
|
4
|
+
from followthemoney.schema import Schema
|
|
5
|
+
from followthemoney.types import registry
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Exporter(object):
    """Base class for exporters that write entity proxies to some format.

    Subclasses implement ``write`` and may override ``finalize``.
    """

    def __init__(self, export_all: bool = False) -> None:
        # When true, hidden and entity-typed properties are exported as well.
        self.export_all = export_all

    def exportable_properties(self, schema: Schema) -> Generator[Property, None, None]:
        """Yield the properties of ``schema`` that should be exported.

        Unless ``export_all`` is set, hidden properties and properties of the
        entity type are left out.
        """
        for candidate in schema.sorted_properties:
            skipped = not self.export_all and (
                candidate.hidden or candidate.type == registry.entity
            )
            if not skipped:
                yield candidate

    def exportable_fields(
        self, proxy: E
    ) -> Generator[Tuple[Property, List[str]], None, None]:
        """Yield ``(property, values)`` for each exportable property of ``proxy``."""
        for exportable in self.exportable_properties(proxy.schema):
            yield exportable, proxy.get(exportable)

    def write(self, proxy: E, extra: Optional[List[str]] = None) -> None:
        """Export a single proxy; must be provided by subclasses."""
        raise NotImplementedError

    def finalize(self) -> None:
        """Hook called once writing is done; does nothing by default."""
        return None
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
from _csv import _writer as csv_writer
|
|
5
|
+
except ImportError:
|
|
6
|
+
# Python 3.8/3.9 work-around:
|
|
7
|
+
from _csv import writer as csv_writer # type: ignore
|
|
8
|
+
|
|
9
|
+
from io import TextIOWrapper
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Dict, List, Optional, Tuple
|
|
12
|
+
|
|
13
|
+
from followthemoney.proxy import E
|
|
14
|
+
from followthemoney.export.common import Exporter
|
|
15
|
+
from followthemoney.schema import Schema
|
|
16
|
+
from followthemoney.util import PathLike
|
|
17
|
+
|
|
18
|
+
CSVWriter = csv_writer
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CSVMixin(object):
    """Shared helpers for exporters that write one CSV file per schema.

    Users must call ``_configure`` before anything else and implement
    ``_write_header``.
    """

    def _configure(
        self,
        directory: PathLike,
        extra: Optional[List[str]] = None,
    ) -> None:
        """Remember the output directory and the names of extra columns."""
        self.directory = Path(directory)
        self.extra = extra or []
        # One open (file handle, csv writer) pair per schema written so far.
        self.handles: Dict[Schema, Tuple[TextIOWrapper, CSVWriter]] = {}

    def _open_csv_file(self, name: str) -> Tuple[TextIOWrapper, CSVWriter]:
        """Create ``<directory>/<name>.csv`` and return its handle and writer.

        The output directory is created on demand. The caller owns the
        returned handle and is responsible for closing it.
        """
        self.directory.mkdir(parents=True, exist_ok=True)
        file_path = self.directory.joinpath("{0}.csv".format(name))
        # newline="" is what the csv module requires so that line endings are
        # not translated by the platform; UTF-8 keeps the output independent
        # of the locale's default encoding.
        handle = open(file_path, mode="w", encoding="utf-8", newline="")
        writer = csv.writer(handle, dialect=csv.unix_dialect)
        return handle, writer

    def _write_header(self, writer: CSVWriter, schema: Schema) -> None:
        """Write the header row for ``schema``; must be provided by users."""
        raise NotImplementedError

    def _get_writer(self, schema: Schema) -> CSVWriter:
        """Return the writer for ``schema``, opening its file on first use."""
        if schema not in self.handles:
            handle, writer = self._open_csv_file(schema.name)
            self.handles[schema] = (handle, writer)
            self._write_header(writer, schema)
        handle, writer = self.handles[schema]
        return writer

    def close(self) -> None:
        """Close every per-schema file opened through ``_get_writer``."""
        for handle, _ in self.handles.values():
            handle.close()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class CSVExporter(Exporter, CSVMixin):
    """Export proxies to one CSV file per schema in a directory."""

    def __init__(
        self,
        directory: PathLike,
        export_all: bool = True,
        extra: Optional[List[str]] = None,
    ) -> None:
        Exporter.__init__(self, export_all=export_all)
        self._configure(directory, extra=extra)

    def _write_header(self, writer: CSVWriter, schema: Schema) -> None:
        """Write ``id``, the extra column names and the property names."""
        # Property names rather than labels keep the file machine-readable.
        columns = ["id", *self.extra]
        columns.extend(prop.name for prop in self.exportable_properties(schema))
        writer.writerow(columns)

    def write(self, proxy: E, extra: Optional[List[str]] = None) -> None:
        """Append one row for ``proxy`` to the file of its schema."""
        writer = self._get_writer(proxy.schema)
        row = [proxy.id, *(extra or [])]
        row.extend(
            prop.type.join(values) for prop, values in self.exportable_fields(proxy)
        )
        writer.writerow(row)

    def finalize(self) -> None:
        """Close all files that were opened while writing."""
        self.close()
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
from typing import Dict, List, Optional
|
|
4
|
+
from openpyxl import Workbook
|
|
5
|
+
from openpyxl.cell import WriteOnlyCell
|
|
6
|
+
from openpyxl.styles import Font, PatternFill
|
|
7
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
|
8
|
+
from openpyxl.utils.exceptions import IllegalCharacterError
|
|
9
|
+
|
|
10
|
+
from followthemoney.export.common import Exporter
|
|
11
|
+
from followthemoney.proxy import E
|
|
12
|
+
from followthemoney.schema import Schema
|
|
13
|
+
from followthemoney.util import PathLike, sanitize_text
|
|
14
|
+
|
|
15
|
+
log = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ExcelWriter(object):
    """Thin wrapper around a write-only openpyxl workbook."""

    # Styling applied to every header cell.
    HEADER_FONT = Font(bold=True, color="FFFFFF")
    HEADER_FILL = PatternFill(
        start_color="982022", end_color="982022", fill_type="solid"
    )

    def __init__(self) -> None:
        self.workbook = Workbook(write_only=True)

    def make_sheet(self, title: str, headers: List[str]) -> Worksheet:
        """Create a sheet named ``title`` whose first row holds ``headers``.

        Header texts are passed through ``sanitize_text`` and styled with the
        class-level font and fill.
        """
        sheet: Worksheet = self.workbook.create_sheet(title=title)
        sheet.freeze_panes = "A2"
        sheet.sheet_properties.filterMode = True
        header_row = []
        for text in headers:
            cell = WriteOnlyCell(sheet, value=sanitize_text(text))
            cell.font = self.HEADER_FONT
            cell.fill = self.HEADER_FILL
            header_row.append(cell)
        sheet.append(header_row)
        return sheet

    def get_bytesio(self) -> BytesIO:
        """Save the workbook into a new ``BytesIO`` rewound to its start."""
        stream = BytesIO()
        self.workbook.save(stream)
        stream.seek(0)
        return stream
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class ExcelExporter(ExcelWriter, Exporter):
    """Export proxies to an Excel workbook with one sheet per schema."""

    def __init__(self, file_path: PathLike, extra: Optional[List[str]] = None):
        ExcelWriter.__init__(self)
        Exporter.__init__(self)
        self.file_path = file_path
        self.extra = extra or []
        # Sheets created so far, keyed by schema.
        self.sheets: Dict[Schema, Worksheet] = {}

    def write(self, proxy: E, extra: Optional[List[str]] = None) -> None:
        """Append one row for ``proxy``, creating its schema's sheet if needed.

        A row that openpyxl rejects with ``IllegalCharacterError`` is logged
        and not written.
        """
        if proxy.schema not in self.sheets:
            titles = ["ID", *self.extra]
            titles.extend(
                prop.label for prop in self.exportable_properties(proxy.schema)
            )
            self.sheets[proxy.schema] = self.make_sheet(proxy.schema.plural, titles)
        sheet = self.sheets[proxy.schema]
        try:
            row = [proxy.id, *(extra or [])]
            row.extend(
                prop.type.join(values)
                for prop, values in self.exportable_fields(proxy)
            )
            sheet.append(row)
        except IllegalCharacterError as ice:
            log.error("Invalid text for Excel export: %s", ice)

    def finalize(self) -> None:
        """Save the workbook to ``file_path``."""
        self.workbook.save(self.file_path)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from typing import Dict, Iterable, List, Optional, TextIO, Union
|
|
2
|
+
import networkx as nx # type: ignore
|
|
3
|
+
from networkx.readwrite.gexf import generate_gexf # type: ignore
|
|
4
|
+
|
|
5
|
+
from followthemoney.graph import Edge, Graph, Node
|
|
6
|
+
from followthemoney.proxy import E
|
|
7
|
+
from followthemoney.types import registry
|
|
8
|
+
from followthemoney.export.common import Exporter
|
|
9
|
+
|
|
10
|
+
DEFAULT_EDGE_TYPES = (registry.entity.name,)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def edge_types() -> List[str]:
    """Return the names of all types listed in ``registry.matchable``."""
    names: List[str] = []
    for matchable_type in registry.matchable:
        if matchable_type is not None:
            names.append(matchable_type.name)
    return names
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class GraphExporter(Exporter):
    """Shared base for exporters that turn a stream of entities into a
    property graph."""

    def __init__(self, edge_types: Iterable[str] = DEFAULT_EDGE_TYPES) -> None:
        super().__init__()
        self.graph = Graph(edge_types=registry.get_types(edge_types))

    def get_attributes(self, element: Union[Node, Edge]) -> Dict[str, str]:
        """Map property names to joined values for the element's proxy.

        Returns an empty mapping when the element has no (truthy) proxy.
        """
        if not element.proxy:
            return {}
        return {
            prop.name: prop.type.join(values)
            for prop, values in self.exportable_fields(element.proxy)
        }

    def write(self, proxy: E, extra: Optional[List[str]] = None) -> None:
        """Add ``proxy`` to the graph, then let the subclass write it out."""
        self.graph.add(proxy)
        self.write_graph()

    def finalize(self) -> None:
        """Run the subclass's final step, then flush the graph."""
        self.finalize_graph()
        self.graph.flush()

    def write_graph(self) -> None:
        """Hook called after every added proxy; does nothing by default."""
        return None

    def finalize_graph(self) -> None:
        """Hook called once at the end; does nothing by default."""
        return None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class NXGraphExporter(GraphExporter):
    """Collect the graph into a NetworkX structure and write it as GEXF,
    the format read by Gephi."""

    def __init__(
        self, fh: TextIO, edge_types: Iterable[str] = DEFAULT_EDGE_TYPES
    ) -> None:
        super().__init__(edge_types=edge_types)
        self.fh = fh

    def finalize_graph(self) -> None:
        """Convert the FtM graph to a NetworkX multi-digraph and write GEXF."""
        nx_graph = nx.MultiDiGraph()

        for node in self.graph.iternodes():
            node_attrs = self.get_attributes(node)
            node_attrs["schema"] = node.type.name
            if node.caption is not None:
                node_attrs["label"] = node.caption
            if node.is_entity and node.schema is not None:
                # Entity nodes are tagged with their schema instead of the type.
                node_attrs["schema"] = node.schema.name
            nx_graph.add_node(node.id, **node_attrs)

        for edge in self.graph.iteredges():
            edge_attrs = self.get_attributes(edge)
            edge_attrs["schema"] = edge.type_name
            edge_attrs["weight"] = str(edge.weight)
            nx_graph.add_edge(
                edge.source_id, edge.target_id, key=edge.id, **edge_attrs
            )

        for gexf_line in generate_gexf(nx_graph, prettyprint=True):
            self.fh.write(gexf_line)
            self.fh.write("\n")
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Dict, Iterable, List, Optional, Set, TextIO
|
|
5
|
+
import stringcase # type: ignore
|
|
6
|
+
|
|
7
|
+
from followthemoney.export.csv import CSVMixin, CSVWriter
|
|
8
|
+
from followthemoney.export.graph import GraphExporter, DEFAULT_EDGE_TYPES
|
|
9
|
+
from followthemoney.graph import Edge, Node
|
|
10
|
+
from followthemoney.schema import Schema
|
|
11
|
+
from followthemoney.util import PathLike
|
|
12
|
+
|
|
13
|
+
log = logging.getLogger(__name__)
|
|
14
|
+
NEO4J_ADMIN_PATH = os.environ.get("NEO4J_ADMIN_PATH", "neo4j-admin")
|
|
15
|
+
NEO4J_DATABASE_NAME = os.environ.get("NEO4J_DATABASE_NAME", "graph.db")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Neo4JCSVExporter(CSVMixin, GraphExporter):
    """Write the graph as CSV files plus a ``neo4j_import.sh`` script that
    lists them for the Neo4j admin import command."""

    def __init__(
        self,
        directory: PathLike,
        extra: Optional[List[str]] = None,
        edge_types: Iterable[str] = DEFAULT_EDGE_TYPES,
    ) -> None:
        super().__init__(edge_types=edge_types)
        self._configure(directory, extra=extra)

        # Edges derived from properties go into a shared links file.
        self.links_handler, self.links_writer = self._open_csv_file("_links")
        self.links_writer.writerow([":TYPE", ":START_ID", ":END_ID", "weight"])

        # Nodes that are not entities go into a shared nodes file.
        self.nodes_handler, self.nodes_writer = self._open_csv_file("_nodes")
        self.nodes_writer.writerow(["id:ID", ":LABEL", "caption"])
        self.nodes_seen: Set[str] = set()

    def _write_header(self, writer: CSVWriter, schema: Schema) -> None:
        """Write the header for a per-schema node or relationship file."""
        if schema.edge:
            columns = ["id", ":TYPE", ":START_ID", ":END_ID"]
        else:
            columns = ["id:ID", ":LABEL", "caption"]
        columns.extend(self.extra)
        columns.extend(prop.name for prop in self.exportable_properties(schema))
        writer.writerow(columns)

    def write_graph(self, extra: Optional[List[str]] = None) -> None:
        """Write all pending nodes and edges, then flush the graph."""
        extra_cells = extra or []
        for node in self.graph.iternodes():
            self.write_node(node, extra_cells)
        for edge in self.graph.iteredges():
            self.write_edge(edge, extra_cells)
        self.graph.flush()

    def write_node(self, node: Node, extra: List[str]) -> None:
        """Write one node; nodes without an id are skipped."""
        if node.id is None:
            return None
        if not node.is_entity and node.id not in self.nodes_seen:
            # Non-entity nodes are written once to the shared nodes file.
            self.nodes_writer.writerow([node.id, node.type.name, node.caption])
            self.nodes_seen.add(node.id)
        if node.proxy is not None and node.schema is not None:
            row = [node.id, ";".join(node.schema.names), node.caption]
            row.extend(extra or [])
            row.extend(
                prop.type.join(values)
                for prop, values in self.exportable_fields(node.proxy)
            )
            self._get_writer(node.schema).writerow(row)

    def write_edge(self, edge: Edge, extra: List[str]) -> None:
        """Write one edge to the links file and/or its schema's file."""
        if edge.prop is not None:
            link_type = stringcase.constcase(edge.prop.name)
            self.links_writer.writerow(
                [link_type, edge.source_id, edge.target_id, edge.weight]
            )
        if edge.proxy is not None:
            entity = edge.proxy
            rel_type = stringcase.constcase(entity.schema.name)
            # That potentially may lead to multiple edges with same id
            row = [entity.id, rel_type, edge.source_id, edge.target_id]
            row.extend(extra or [])
            row.extend(
                prop.type.join(values)
                for prop, values in self.exportable_fields(edge.proxy)
            )
            self._get_writer(entity.schema).writerow(row)

    def finalize_graph(self) -> None:
        """Write ``neo4j_import.sh`` and close every CSV file."""
        nodes_arg = "\t--nodes={} \\\n"
        relationships_arg = "\t--relationships={} \\\n"
        script_path = self.directory.joinpath("neo4j_import.sh")
        with open(script_path, mode="w") as script:
            command = "{} import --id-type=STRING --database={} \\\n"
            script.write(command.format(NEO4J_ADMIN_PATH, NEO4J_DATABASE_NAME))
            script.write("\t--multiline-fields=true \\\n")
            script.write(
                relationships_arg.format(os.path.basename(self.links_handler.name))
            )
            script.write(nodes_arg.format(os.path.basename(self.nodes_handler.name)))

            for schema, (handle, _writer) in self.handles.items():
                file_name = os.path.basename(handle.name)
                template = relationships_arg if schema.edge else nodes_arg
                script.write(template.format(file_name))

        self.links_handler.close()
        self.nodes_handler.close()
        self.close()
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class CypherGraphExporter(GraphExporter):
    """Export the graph as Cypher statements for import into Neo4J.

    Every node and edge becomes its own statement, much like writing SQL row
    by row, so a CSV-based import may suit large datasets better.
    """

    # https://www.opencypher.org/
    # MATCH (n) DETACH DELETE n;

    def __init__(self, fh: TextIO, edge_types: Iterable[str] = DEFAULT_EDGE_TYPES):
        super().__init__(edge_types=edge_types)
        self.fh = fh
        # Values of proxy-backed nodes that have already been written.
        self.proxy_nodes: Set[str] = set()

    def _to_map(self, data: Dict[str, Any]) -> str:
        """Render the truthy entries of ``data`` as ``key: <json value>`` pairs
        joined by commas."""
        return ", ".join(
            "%s: %s" % (key, json.dumps(value)) for key, value in data.items() if value
        )

    def write_graph(self) -> None:
        """Write one statement per pending node and edge, then flush the graph."""
        node_query = "MERGE (p { %(id)s }) SET p += { %(map)s } SET p :%(label)s;\n"
        edge_query = (
            "MATCH (s { %(source)s }), (t { %(target)s }) "
            "MERGE (s)-[:%(type)s { %(map)s }]->(t);\n"
        )

        for node in self.graph.iternodes():
            if node.value in self.proxy_nodes or node.id is None:
                continue
            if node.proxy is not None:
                self.proxy_nodes.add(node.value)
            node_attrs = self.get_attributes(node)
            node_attrs["id"] = node.id
            if node.caption is not None:
                node_attrs["caption"] = node.caption
            labels = list(node.schema.names) if node.schema else [node.type.name]
            params = {
                "id": self._to_map({"id": node.id}),
                "map": self._to_map(node_attrs),
                "label": ":".join(labels),
            }
            self.fh.write(node_query % params)

        for edge in self.graph.iteredges():
            edge_attrs = self.get_attributes(edge)
            edge_attrs["id"] = edge.id
            edge_attrs["weight"] = str(edge.weight)
            params = {
                "source": self._to_map({"id": edge.source_id}),
                "target": self._to_map({"id": edge.target_id}),
                "type": stringcase.constcase(edge.type_name),
                "map": self._to_map(edge_attrs),
            }
            self.fh.write(edge_query % params)

        self.graph.flush()
|