followthemoney 1.3.6__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. followthemoney/__init__.py +5 -3
  2. followthemoney/cli/__init__.py +17 -0
  3. followthemoney/cli/aggregate.py +56 -0
  4. followthemoney/cli/cli.py +88 -0
  5. followthemoney/cli/exports.py +121 -0
  6. followthemoney/cli/mapping.py +85 -0
  7. followthemoney/cli/sieve.py +67 -0
  8. followthemoney/cli/util.py +142 -0
  9. followthemoney/compare.py +132 -55
  10. followthemoney/exc.py +19 -6
  11. followthemoney/export/common.py +29 -0
  12. followthemoney/export/csv.py +82 -0
  13. followthemoney/export/excel.py +75 -0
  14. followthemoney/export/graph.py +79 -0
  15. followthemoney/export/neo4j.py +182 -0
  16. followthemoney/export/rdf.py +26 -0
  17. followthemoney/graph.py +308 -0
  18. followthemoney/helpers.py +212 -0
  19. followthemoney/mapping/__init__.py +1 -1
  20. followthemoney/mapping/csv.py +67 -35
  21. followthemoney/mapping/entity.py +116 -44
  22. followthemoney/mapping/property.py +90 -44
  23. followthemoney/mapping/query.py +27 -19
  24. followthemoney/mapping/source.py +15 -5
  25. followthemoney/mapping/sql.py +75 -61
  26. followthemoney/messages.py +13 -7
  27. followthemoney/model.py +108 -56
  28. followthemoney/namespace.py +119 -0
  29. followthemoney/offshore.py +48 -0
  30. followthemoney/ontology.py +77 -0
  31. followthemoney/property.py +204 -71
  32. followthemoney/proxy.py +455 -118
  33. followthemoney/rdf.py +9 -0
  34. followthemoney/schema/Address.yaml +78 -0
  35. followthemoney/schema/Airplane.yaml +17 -10
  36. followthemoney/schema/Analyzable.yaml +54 -0
  37. followthemoney/schema/Article.yaml +16 -0
  38. followthemoney/schema/Assessment.yaml +32 -0
  39. followthemoney/schema/Asset.yaml +10 -4
  40. followthemoney/schema/Associate.yaml +41 -0
  41. followthemoney/schema/Audio.yaml +24 -0
  42. followthemoney/schema/BankAccount.yaml +53 -9
  43. followthemoney/schema/Call.yaml +48 -0
  44. followthemoney/schema/CallForTenders.yaml +117 -0
  45. followthemoney/schema/Company.yaml +37 -12
  46. followthemoney/schema/Contract.yaml +41 -7
  47. followthemoney/schema/ContractAward.yaml +30 -11
  48. followthemoney/schema/CourtCase.yaml +16 -10
  49. followthemoney/schema/CourtCaseParty.yaml +17 -6
  50. followthemoney/schema/CryptoWallet.yaml +48 -0
  51. followthemoney/schema/Debt.yaml +37 -0
  52. followthemoney/schema/Directorship.yaml +17 -4
  53. followthemoney/schema/Document.yaml +72 -139
  54. followthemoney/schema/Documentation.yml +38 -0
  55. followthemoney/schema/EconomicActivity.yaml +32 -17
  56. followthemoney/schema/Email.yaml +76 -0
  57. followthemoney/schema/Employment.yaml +39 -0
  58. followthemoney/schema/Event.yaml +35 -3
  59. followthemoney/schema/Family.yaml +41 -0
  60. followthemoney/schema/Folder.yaml +13 -0
  61. followthemoney/schema/HyperText.yaml +21 -0
  62. followthemoney/schema/Identification.yaml +40 -0
  63. followthemoney/schema/Image.yaml +25 -0
  64. followthemoney/schema/Interest.yaml +3 -6
  65. followthemoney/schema/Interval.yaml +56 -5
  66. followthemoney/schema/LegalEntity.yaml +81 -20
  67. followthemoney/schema/License.yaml +7 -3
  68. followthemoney/schema/Membership.yaml +19 -4
  69. followthemoney/schema/Mention.yaml +54 -0
  70. followthemoney/schema/Message.yaml +73 -0
  71. followthemoney/schema/Note.yaml +23 -0
  72. followthemoney/schema/Occupancy.yaml +40 -0
  73. followthemoney/schema/Organization.yaml +38 -3
  74. followthemoney/schema/Ownership.yaml +16 -4
  75. followthemoney/schema/Package.yaml +17 -0
  76. followthemoney/schema/Page.yaml +43 -0
  77. followthemoney/schema/Pages.yaml +23 -0
  78. followthemoney/schema/Passport.yaml +15 -17
  79. followthemoney/schema/Payment.yaml +38 -7
  80. followthemoney/schema/Person.yaml +61 -5
  81. followthemoney/schema/PlainText.yaml +17 -0
  82. followthemoney/schema/Position.yaml +50 -0
  83. followthemoney/schema/Post.yaml +42 -0
  84. followthemoney/schema/Project.yaml +27 -0
  85. followthemoney/schema/ProjectParticipant.yaml +36 -0
  86. followthemoney/schema/PublicBody.yaml +14 -3
  87. followthemoney/schema/RealEstate.yaml +19 -3
  88. followthemoney/schema/Representation.yaml +17 -6
  89. followthemoney/schema/Sanction.yaml +44 -20
  90. followthemoney/schema/Security.yaml +59 -0
  91. followthemoney/schema/Similar.yaml +37 -0
  92. followthemoney/schema/Succession.yaml +36 -0
  93. followthemoney/schema/Table.yaml +32 -0
  94. followthemoney/schema/TaxRoll.yaml +27 -9
  95. followthemoney/schema/Thing.yaml +69 -13
  96. followthemoney/schema/Trip.yaml +42 -0
  97. followthemoney/schema/UnknownLink.yaml +17 -6
  98. followthemoney/schema/UserAccount.yaml +44 -0
  99. followthemoney/schema/Value.yaml +5 -1
  100. followthemoney/schema/Vehicle.yaml +25 -8
  101. followthemoney/schema/Vessel.yaml +18 -10
  102. followthemoney/schema/Video.yaml +20 -0
  103. followthemoney/schema/Workbook.yaml +18 -0
  104. followthemoney/schema.py +406 -135
  105. followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
  106. followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
  107. followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
  108. followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
  109. followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
  110. followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
  111. followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
  112. followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
  113. followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
  114. followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
  115. followthemoney/translations/fr/followthemoney.po +3861 -0
  116. followthemoney/translations/messages.pot +3021 -725
  117. followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
  118. followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
  119. followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
  120. followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
  121. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
  122. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
  123. followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
  124. followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
  125. followthemoney/translations/ru/followthemoney.po +4221 -0
  126. followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
  127. followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
  128. followthemoney/types/__init__.py +35 -17
  129. followthemoney/types/address.py +41 -21
  130. followthemoney/types/checksum.py +25 -0
  131. followthemoney/types/common.py +233 -88
  132. followthemoney/types/country.py +89 -56
  133. followthemoney/types/date.py +59 -76
  134. followthemoney/types/email.py +66 -35
  135. followthemoney/types/entity.py +66 -13
  136. followthemoney/types/gender.py +66 -0
  137. followthemoney/types/iban.py +47 -28
  138. followthemoney/types/identifier.py +49 -22
  139. followthemoney/types/ip.py +35 -21
  140. followthemoney/types/json.py +58 -0
  141. followthemoney/types/language.py +124 -37
  142. followthemoney/types/mimetype.py +44 -0
  143. followthemoney/types/name.py +56 -12
  144. followthemoney/types/number.py +30 -0
  145. followthemoney/types/phone.py +92 -34
  146. followthemoney/types/registry.py +52 -0
  147. followthemoney/types/string.py +43 -0
  148. followthemoney/types/topic.py +94 -0
  149. followthemoney/types/url.py +39 -17
  150. followthemoney/util.py +139 -45
  151. followthemoney-3.8.0.dist-info/METADATA +153 -0
  152. followthemoney-3.8.0.dist-info/RECORD +157 -0
  153. {followthemoney-1.3.6.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
  154. followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
  155. followthemoney-1.3.6.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
  156. followthemoney/link.py +0 -75
  157. followthemoney/schema/Associate.yml +0 -19
  158. followthemoney/schema/Family.yml +0 -19
  159. followthemoney/schema/Land.yml +0 -9
  160. followthemoney/schema/Relationship.yaml +0 -26
  161. followthemoney/types/domain.py +0 -50
  162. followthemoney-1.3.6.dist-info/DESCRIPTION.rst +0 -3
  163. followthemoney-1.3.6.dist-info/METADATA +0 -39
  164. followthemoney-1.3.6.dist-info/RECORD +0 -108
  165. followthemoney-1.3.6.dist-info/entry_points.txt +0 -3
  166. followthemoney-1.3.6.dist-info/metadata.json +0 -1
  167. followthemoney-1.3.6.dist-info/namespace_packages.txt +0 -1
  168. followthemoney-1.3.6.dist-info/top_level.txt +0 -3
  169. ns/ontology.py +0 -128
  170. tests/types/test_addresses.py +0 -24
  171. tests/types/test_common.py +0 -27
  172. tests/types/test_countries.py +0 -21
  173. tests/types/test_dates.py +0 -72
  174. tests/types/test_domains.py +0 -23
  175. tests/types/test_emails.py +0 -30
  176. tests/types/test_entity.py +0 -16
  177. tests/types/test_iban.py +0 -109
  178. tests/types/test_identifiers.py +0 -25
  179. tests/types/test_ip.py +0 -26
  180. tests/types/test_languages.py +0 -20
  181. tests/types/test_names.py +0 -33
  182. tests/types/test_phones.py +0 -24
  183. tests/types/test_registry.py +0 -14
  184. tests/types/test_urls.py +0 -23
  185. {ns → followthemoney/export}/__init__.py +0 -0
  186. /tests/types/__init__.py → /followthemoney/py.typed +0 -0
followthemoney/compare.py CHANGED
@@ -1,64 +1,141 @@
1
+ import math
1
2
  import itertools
2
- from Levenshtein import jaro
3
+ from typing import Dict, Generator, Iterable, List, Optional
4
+ import fingerprints
3
5
  from normality import normalize
4
- from followthemoney.types import registry
5
- from followthemoney.util import dampen
6
6
  from followthemoney.exc import InvalidData
7
+ from followthemoney.model import Model
8
+ from followthemoney.types import registry
9
+ from followthemoney.proxy import EntityProxy
10
+ from followthemoney.types.common import PropertyType
7
11
 
8
- # OK, Here's the plan: we have to find a way to get user judgements
9
- # on as many of these matches as we can, then build a regression
10
- # model which properly weights the value of a matching property
11
- # based upon it's type.
12
- FP_WEIGHT = 0.6
13
- MATCH_WEIGHTS = {
14
- registry.text: 0,
15
- registry.name: 0, # because we already compare names
16
- registry.identifier: 0.4,
17
- registry.url: 0.1,
18
- registry.email: 0.3,
19
- registry.ip: 0.1,
20
- registry.iban: 0.3,
21
- registry.address: 0.2,
22
- registry.date: 0.3,
23
- registry.phone: 0.1,
24
- registry.country: 0.1,
25
- registry.language: 0.1,
12
+
13
+ # Compare weights come from the glm-bernoulli model in followthemoney-predict
14
+ Weights = Dict[Optional[PropertyType], float]
15
+ Scores = Dict[PropertyType, Optional[float]]
16
+ COMPARE_WEIGHTS: Weights = {
17
+ registry.name: 12.275729155073371,
18
+ registry.country: 1.0494517476987815,
19
+ registry.date: 6.960245940274218,
20
+ registry.identifier: 5.2209896558064175,
21
+ registry.address: 6.456137299747168,
22
+ registry.phone: 3.538892687331418,
23
+ registry.email: 14.115925628770384,
24
+ registry.iban: 0.019140301711998726,
25
+ registry.url: 3.211995327345834,
26
+ None: -11.91521189545115,
26
27
  }
27
28
 
28
29
 
29
- def compare(model, left, right):
30
- """Compare two entities and return number between 0 and 1.
31
- Returned number indicates probability that two entities are the same.
32
- """
33
- left = model.get_proxy(left)
34
- right = model.get_proxy(right)
35
- if right.schema not in list(left.schema.matchable_schemata):
36
- return 0
37
- schema = model.common_schema(left.schema, right.schema)
38
- score = compare_names(left, right) * FP_WEIGHT
39
- for name, prop in schema.properties.items():
40
- weight = MATCH_WEIGHTS.get(prop.type, 0)
41
- if weight == 0:
42
- continue
30
+ def compare_scores(model: Model, left: EntityProxy, right: EntityProxy) -> Scores:
31
+ """Compare two entities and return a match score for each property."""
32
+ try:
33
+ model.common_schema(left.schema, right.schema)
34
+ except InvalidData:
35
+ return {}
36
+ scores: Scores = {}
37
+ left_inv = left.get_type_inverted(matchable=True)
38
+ right_inv = right.get_type_inverted(matchable=True)
39
+ left_groups = set(left_inv.keys())
40
+ right_groups = set(right_inv.keys())
41
+ for group_name in left_groups.intersection(right_groups):
42
+ group = registry.groups[group_name]
43
43
  try:
44
- left_values = left.get(name)
45
- right_values = right.get(name)
46
- except InvalidData:
47
- continue
48
-
49
- if not len(left_values) or not len(right_values):
50
- continue
51
- prop_score = prop.type.compare_sets(left_values, right_values)
52
- score += (prop_score * weight)
53
- return max(0.0, min(1.0, score)) * 0.9
54
-
55
-
56
- def compare_names(left, right):
57
- result = 0
58
- left_list = [normalize(n, latinize=True) for n in left.names]
59
- right_list = [normalize(n, latinize=True) for n in right.names]
60
- for (left, right) in itertools.product(left_list, right_list):
61
- similarity = jaro(left, right)
62
- score = similarity * dampen(2, 20, min(left, right, key=len))
63
- result = max(result, score)
44
+ if group == registry.name:
45
+ score = compare_names(left, right)
46
+ elif group == registry.country:
47
+ score = compare_countries(left, right)
48
+ else:
49
+ score = compare_group(
50
+ group, left_inv[group_name], right_inv[group_name]
51
+ )
52
+ scores[group] = score
53
+ except ValueError:
54
+ pass
55
+ for group_name in left_groups.symmetric_difference(right_groups):
56
+ group = registry.groups[group_name]
57
+ scores[group] = None
58
+ return scores
59
+
60
+
61
def _compare(scores: Scores, weights: Weights, n_std: int = 1) -> float:
    """Fold per-type scores into a single value via a logistic function.

    Every entry in `weights` contributes to a linear sum: a truthy key
    contributes ``weight * score`` (a missing or ``None`` score counts as
    0.0), while a falsy key (the ``None`` entry) contributes its weight
    unscaled. The sum is passed through ``1 / (1 + exp(-x))``.

    Returns 0.0 when `scores` is empty or holds no truthy value.
    `n_std` is accepted but not used by this function.
    """
    if not (scores and any(scores.values())):
        return 0.0
    logit = 0.0
    for key, factor in weights.items():
        # Accumulate term by term so the summation order matches the
        # iteration order of `weights`.
        logit += factor * (scores.get(key) or 0.0) if key else factor
    return 1.0 / (1.0 + math.exp(-logit))
71
+
72
+
73
def compare(
    model: Model,
    left: EntityProxy,
    right: EntityProxy,
    weights: Weights = COMPARE_WEIGHTS,
) -> float:
    """Compare two entities and return a match score.

    The per-type scores produced by `compare_scores` are combined into a
    single float by `_compare`, using `weights` (defaults to
    `COMPARE_WEIGHTS`).
    """
    return _compare(compare_scores(model, left, right), weights)
82
+
83
+
84
def _normalize_names(names: Iterable[str]) -> Generator[str, None, None]:
    """Generate a de-duplicated sequence of comparable forms for `names`.

    For each name this yields its `normalize(..., ascii=True)` form and then
    its `fingerprints.generate` form. A form is emitted only the first time
    it is seen, and fingerprints of six characters or fewer are dropped.
    """
    emitted = set()
    for raw in names:
        normalized = normalize(raw, ascii=True)
        if normalized is not None and normalized not in emitted:
            emitted.add(normalized)
            yield normalized
        fingerprint = fingerprints.generate(raw)
        if fingerprint is None or len(fingerprint) <= 6:
            continue
        if fingerprint not in emitted:
            emitted.add(fingerprint)
            yield fingerprint
99
+
100
+
101
def compare_group(
    group_type: PropertyType, left_values: List[str], right_values: List[str]
) -> Optional[float]:
    """Compare the values two entities hold for a single property type.

    Returns ``None`` when exactly one side has values, otherwise the result
    of ``group_type.compare_sets(left_values, right_values)``.

    Raises:
        ValueError: if both value lists are empty.
    """
    if not left_values and not right_values:
        # Interpolate the type into the message: passing it as a second
        # argument to ValueError would leave the "%s" placeholder unformatted.
        raise ValueError(
            "At least one proxy must have property type: %s" % (group_type,)
        )
    if not left_values or not right_values:
        return None
    return group_type.compare_sets(left_values, right_values)
109
+
110
+
111
+ def compare_names(
112
+ left: EntityProxy, right: EntityProxy, max_names: int = 200
113
+ ) -> Optional[float]:
114
+ result = 0.0
115
+ left_list = list(itertools.islice(_normalize_names(left.names), max_names))
116
+ right_list = list(itertools.islice(_normalize_names(right.names), max_names))
117
+ if not left_list and not right_list:
118
+ raise ValueError("At least one proxy must have name properties")
119
+ elif not left_list or not right_list:
120
+ return None
121
+ for (left_val, right_val) in itertools.product(left_list, right_list):
122
+ similarity = registry.name.compare(left_val, right_val)
123
+ result = max(result, similarity)
124
+ if result == 1.0:
125
+ break
126
+ result *= min(
127
+ 1.0, 2 ** (-len(left_list) * len(right_list) / (max_names * max_names))
128
+ )
64
129
  return result
130
+
131
+
132
def compare_countries(left: EntityProxy, right: EntityProxy) -> Optional[float]:
    """Score the overlap of two entities' `country_hints`.

    Returns the size of the intersection divided by the size of the union,
    or ``None`` when exactly one side has hints.

    Raises:
        ValueError: if neither entity has any country hints.
    """
    left_hints = left.country_hints
    right_hints = right.country_hints
    if not (left_hints or right_hints):
        raise ValueError("At least one proxy must have country properties")
    if not (left_hints and right_hints):
        return None
    shared = left_hints.intersection(right_hints)
    combined = left_hints.union(right_hints)
    return len(shared) / float(len(combined))
followthemoney/exc.py CHANGED
@@ -1,18 +1,31 @@
1
+ from typing import Dict, Optional, TypedDict
1
2
 
2
3
 
3
- class InvalidData(Exception):
4
class ErrorSpec(TypedDict, total=False):
    """Typed shape of the `errors` payload on `InvalidData`; all keys optional."""

    # Maps a property name to an error message (string to string).
    properties: Dict[str, str]
6
+
7
+
8
class FollowTheMoneyException(Exception):
    """Catch-all exception for errors emitted by this library."""
12
+
13
+
14
+ class InvalidData(FollowTheMoneyException):
4
15
  """Schema validation errors will be caught by the API."""
5
16
 
6
- def __init__(self, errors):
7
- self.errors = errors
8
- super(InvalidData, self).__init__(repr(errors))
17
+ def __init__(self, message: str, errors: Optional[ErrorSpec] = None) -> None:
18
+ super(InvalidData, self).__init__(message)
19
+ self.errors: ErrorSpec = errors or {}
9
20
 
10
21
 
11
- class InvalidModel(Exception):
22
+ class InvalidModel(FollowTheMoneyException):
12
23
  """The schema model is not defined correctly."""
24
+
13
25
  pass
14
26
 
15
27
 
16
- class InvalidMapping(Exception):
28
+ class InvalidMapping(FollowTheMoneyException):
17
29
  """A data mapping was invalid."""
30
+
18
31
  pass
@@ -0,0 +1,29 @@
1
+ from typing import Generator, List, Optional, Tuple
2
+ from followthemoney.property import Property
3
+ from followthemoney.proxy import E
4
+ from followthemoney.schema import Schema
5
+ from followthemoney.types import registry
6
+
7
+
8
class Exporter(object):
    """Base class for entity exporters.

    Subclasses implement `write`; `finalize` is a no-op hook by default.
    """

    def __init__(self, export_all: bool = False) -> None:
        # When false, hidden properties and entity-typed properties are
        # excluded by `exportable_properties`.
        self.export_all = export_all

    def exportable_properties(self, schema: Schema) -> Generator[Property, None, None]:
        """Yield the properties of `schema.sorted_properties` to export."""
        for candidate in schema.sorted_properties:
            excluded = not self.export_all and (
                candidate.hidden or candidate.type == registry.entity
            )
            if not excluded:
                yield candidate

    def exportable_fields(
        self, proxy: E
    ) -> Generator[Tuple[Property, List[str]], None, None]:
        """Yield ``(property, values)`` pairs for each exportable property."""
        for candidate in self.exportable_properties(proxy.schema):
            values = proxy.get(candidate)
            yield candidate, values

    def write(self, proxy: E, extra: Optional[List[str]] = None) -> None:
        """Export one entity; must be provided by subclasses."""
        raise NotImplementedError

    def finalize(self) -> None:
        """Hook called once writing is finished; does nothing here."""
        return None
@@ -0,0 +1,82 @@
1
+ import csv
2
+
3
+ try:
4
+ from _csv import _writer as csv_writer
5
+ except ImportError:
6
+ # Python 3.8/3.9 work-around:
7
+ from _csv import writer as csv_writer # type: ignore
8
+
9
+ from io import TextIOWrapper
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional, Tuple
12
+
13
+ from followthemoney.proxy import E
14
+ from followthemoney.export.common import Exporter
15
+ from followthemoney.schema import Schema
16
+ from followthemoney.util import PathLike
17
+
18
+ CSVWriter = csv_writer
19
+
20
+
21
class CSVMixin(object):
    """Mixin that keeps one CSV file (and writer) per schema in a directory.

    Users call `_configure` first and must implement `_write_header`.
    """

    def _configure(
        self,
        directory: PathLike,
        extra: Optional[List[str]] = None,
    ) -> None:
        """Set the output directory, the extra column names and the handle map."""
        self.directory = Path(directory)
        self.extra = extra or []
        # Schema -> (open file handle, csv writer bound to that handle).
        self.handles: Dict[Schema, Tuple[TextIOWrapper, CSVWriter]] = {}

    def _open_csv_file(self, name: str) -> Tuple[TextIOWrapper, CSVWriter]:
        """Create ``<directory>/<name>.csv`` for writing and return handle and writer."""
        self.directory.mkdir(parents=True, exist_ok=True)
        file_path = self.directory.joinpath("{0}.csv".format(name))
        # newline="" lets the csv module control line endings (as its
        # documentation requires); the explicit encoding keeps the output
        # independent of the platform's locale.
        handle = open(file_path, mode="w", encoding="utf-8", newline="")
        writer = csv.writer(handle, dialect=csv.unix_dialect)
        return handle, writer

    def _write_header(self, writer: CSVWriter, schema: Schema) -> None:
        """Write the header row for `schema`; must be provided by subclasses."""
        raise NotImplementedError

    def _get_writer(self, schema: Schema) -> CSVWriter:
        """Return the writer for `schema`, opening its file on first use."""
        if schema not in self.handles:
            handle, writer = self._open_csv_file(schema.name)
            self.handles[schema] = (handle, writer)
            # The header is written exactly once, when the file is created.
            self._write_header(writer, schema)
        _, writer = self.handles[schema]
        return writer

    def close(self) -> None:
        """Close every per-schema file handle opened so far."""
        for handle, _ in self.handles.values():
            handle.close()
52
+
53
+
54
class CSVExporter(Exporter, CSVMixin):
    """Export entities to one CSV file per schema inside `directory`."""

    def __init__(
        self,
        directory: PathLike,
        export_all: bool = True,
        extra: Optional[List[str]] = None,
    ) -> None:
        Exporter.__init__(self, export_all=export_all)
        self._configure(directory, extra=extra)

    def _write_header(self, writer: CSVWriter, schema: Schema) -> None:
        """Write ``id``, the extra columns, then one column per property name."""
        # Property names (not labels) are used to stay machine-readable.
        prop_names = [prop.name for prop in self.exportable_properties(schema)]
        writer.writerow(["id", *self.extra, *prop_names])

    def write(self, proxy: E, extra: Optional[List[str]] = None) -> None:
        """Append one row for `proxy` to the CSV file of its schema."""
        writer = self._get_writer(proxy.schema)
        row = [proxy.id]
        if extra:
            row.extend(extra)
        for prop, values in self.exportable_fields(proxy):
            row.append(prop.type.join(values))
        writer.writerow(row)

    def finalize(self) -> None:
        """Close all files opened by this exporter."""
        self.close()
@@ -0,0 +1,75 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from typing import Dict, List, Optional
4
+ from openpyxl import Workbook
5
+ from openpyxl.cell import WriteOnlyCell
6
+ from openpyxl.styles import Font, PatternFill
7
+ from openpyxl.worksheet.worksheet import Worksheet
8
+ from openpyxl.utils.exceptions import IllegalCharacterError
9
+
10
+ from followthemoney.export.common import Exporter
11
+ from followthemoney.proxy import E
12
+ from followthemoney.schema import Schema
13
+ from followthemoney.util import PathLike, sanitize_text
14
+
15
+ log = logging.getLogger(__name__)
16
+
17
+
18
class ExcelWriter(object):
    """Thin wrapper around a write-only openpyxl workbook."""

    # Styling applied to every header cell created by `make_sheet`.
    HEADER_FONT = Font(bold=True, color="FFFFFF")
    HEADER_FILL = PatternFill(
        start_color="982022", end_color="982022", fill_type="solid"
    )

    def __init__(self) -> None:
        self.workbook = Workbook(write_only=True)

    def _header_cell(self, sheet: Worksheet, header: str) -> WriteOnlyCell:
        """Build one styled header cell holding the sanitized `header` text."""
        cell = WriteOnlyCell(sheet, value=sanitize_text(header))
        cell.font = self.HEADER_FONT
        cell.fill = self.HEADER_FILL
        return cell

    def make_sheet(self, title: str, headers: List[str]) -> Worksheet:
        """Create a sheet named `title` whose first row is the styled `headers`."""
        sheet: Worksheet = self.workbook.create_sheet(title=title)
        sheet.freeze_panes = "A2"
        sheet.sheet_properties.filterMode = True
        sheet.append([self._header_cell(sheet, header) for header in headers])
        return sheet

    def get_bytesio(self) -> BytesIO:
        """Save the workbook into an in-memory buffer rewound to position 0."""
        output = BytesIO()
        self.workbook.save(output)
        output.seek(0)
        return output
46
+
47
+
48
class ExcelExporter(ExcelWriter, Exporter):
    """Export entities to an Excel workbook with one sheet per schema."""

    def __init__(self, file_path: PathLike, extra: Optional[List[str]] = None):
        ExcelWriter.__init__(self)
        Exporter.__init__(self)
        self.file_path = file_path
        self.extra = extra or []
        # Schema -> worksheet already created for it.
        self.sheets: Dict[Schema, Worksheet] = {}

    def write(self, proxy: E, extra: Optional[List[str]] = None) -> None:
        """Append a row for `proxy`, creating its schema's sheet on first use.

        Rows that openpyxl rejects with `IllegalCharacterError` are logged
        and skipped.
        """
        schema = proxy.schema
        if schema not in self.sheets:
            headers = ["ID"]
            headers.extend(self.extra)
            headers.extend(
                prop.label for prop in self.exportable_properties(schema)
            )
            self.sheets[schema] = self.make_sheet(schema.plural, headers)
        sheet = self.sheets[schema]
        try:
            row = [proxy.id]
            row.extend(extra or [])
            for prop, values in self.exportable_fields(proxy):
                row.append(prop.type.join(values))
            sheet.append(row)
        except IllegalCharacterError as ice:
            log.error("Invalid text for Excel export: %s", ice)

    def finalize(self) -> None:
        """Save the workbook to `file_path`."""
        self.workbook.save(self.file_path)
@@ -0,0 +1,79 @@
1
+ from typing import Dict, Iterable, List, Optional, TextIO, Union
2
+ import networkx as nx # type: ignore
3
+ from networkx.readwrite.gexf import generate_gexf # type: ignore
4
+
5
+ from followthemoney.graph import Edge, Graph, Node
6
+ from followthemoney.proxy import E
7
+ from followthemoney.types import registry
8
+ from followthemoney.export.common import Exporter
9
+
10
+ DEFAULT_EDGE_TYPES = (registry.entity.name,)
11
+
12
+
13
def edge_types() -> List[str]:
    """Return the names of all non-``None`` entries in `registry.matchable`."""
    names = []
    for type_ in registry.matchable:
        if type_ is not None:
            names.append(type_.name)
    return names
15
+
16
+
17
class GraphExporter(Exporter):
    """Base functions for exporting a property graph from a stream
    of entities.

    Subclasses override `write_graph` (called after every added entity)
    and/or `finalize_graph` (called once from `finalize`).
    """

    def __init__(self, edge_types: Iterable[str] = DEFAULT_EDGE_TYPES) -> None:
        super().__init__()
        self.graph = Graph(edge_types=registry.get_types(edge_types))

    def get_attributes(self, element: Union[Node, Edge]) -> Dict[str, str]:
        """Map exportable property names to joined values for `element`.

        Returns an empty dict when the element has no proxy.
        """
        if not element.proxy:
            return {}
        return {
            prop.name: prop.type.join(values)
            for prop, values in self.exportable_fields(element.proxy)
        }

    def write(self, proxy: E, extra: Optional[List[str]] = None) -> None:
        """Add `proxy` to the graph, then give the subclass a chance to write."""
        self.graph.add(proxy)
        self.write_graph()

    def finalize(self) -> None:
        """Run the subclass finalizer, then flush the graph."""
        self.finalize_graph()
        self.graph.flush()

    def write_graph(self) -> None:
        """Hook invoked after each `write`; no-op by default."""
        return None

    def finalize_graph(self) -> None:
        """Hook invoked from `finalize`; no-op by default."""
        return None
46
+
47
+
48
class NXGraphExporter(GraphExporter):
    """Write to NetworkX data structure, which in turn can be exported
    to the file formats for Gephi (GEXF) and D3."""

    def __init__(
        self, fh: TextIO, edge_types: Iterable[str] = DEFAULT_EDGE_TYPES
    ) -> None:
        super().__init__(edge_types=edge_types)
        self.fh = fh

    def finalize_graph(self) -> None:
        """Convert the FtM graph to a NetworkX multi-digraph and write GEXF to `fh`."""
        digraph = nx.MultiDiGraph()

        for node in self.graph.iternodes():
            node_attrs = self.get_attributes(node)
            # Default to the node's type name; entity nodes with a schema
            # use the schema name instead.
            node_attrs["schema"] = node.type.name
            if node.caption is not None:
                node_attrs["label"] = node.caption
            if node.is_entity and node.schema is not None:
                node_attrs["schema"] = node.schema.name
            digraph.add_node(node.id, **node_attrs)

        for edge in self.graph.iteredges():
            edge_attrs = self.get_attributes(edge)
            edge_attrs["schema"] = edge.type_name
            edge_attrs["weight"] = str(edge.weight)
            digraph.add_edge(
                edge.source_id, edge.target_id, key=edge.id, **edge_attrs
            )

        out = self.fh
        for line in generate_gexf(digraph, prettyprint=True):
            out.write(line)
            out.write("\n")
@@ -0,0 +1,182 @@
1
+ import os
2
+ import json
3
+ import logging
4
+ from typing import Any, Dict, Iterable, List, Optional, Set, TextIO
5
+ import stringcase # type: ignore
6
+
7
+ from followthemoney.export.csv import CSVMixin, CSVWriter
8
+ from followthemoney.export.graph import GraphExporter, DEFAULT_EDGE_TYPES
9
+ from followthemoney.graph import Edge, Node
10
+ from followthemoney.schema import Schema
11
+ from followthemoney.util import PathLike
12
+
13
+ log = logging.getLogger(__name__)
14
+ NEO4J_ADMIN_PATH = os.environ.get("NEO4J_ADMIN_PATH", "neo4j-admin")
15
+ NEO4J_DATABASE_NAME = os.environ.get("NEO4J_DATABASE_NAME", "graph.db")
16
+
17
+
18
class Neo4JCSVExporter(CSVMixin, GraphExporter):
    """Export the graph as CSV files plus a ``neo4j_import.sh`` script.

    Besides the per-schema files managed by `CSVMixin`, two shared files are
    written: ``_links.csv`` for property-based edges and ``_nodes.csv`` for
    non-entity nodes.
    """

    def __init__(
        self,
        directory: PathLike,
        extra: Optional[List[str]] = None,
        edge_types: Iterable[str] = DEFAULT_EDGE_TYPES,
    ) -> None:
        super().__init__(edge_types=edge_types)
        self._configure(directory, extra=extra)

        links = self._open_csv_file("_links")
        self.links_handler, self.links_writer = links
        self.links_writer.writerow([":TYPE", ":START_ID", ":END_ID", "weight"])

        nodes = self._open_csv_file("_nodes")
        self.nodes_handler, self.nodes_writer = nodes
        self.nodes_writer.writerow(["id:ID", ":LABEL", "caption"])
        # IDs of non-entity nodes already written to the shared nodes file.
        self.nodes_seen: Set[str] = set()

    def _write_header(self, writer: CSVWriter, schema: Schema) -> None:
        """Write the header row; edge schemata get relationship columns."""
        if schema.edge:
            headers = ["id", ":TYPE", ":START_ID", ":END_ID"]
        else:
            headers = ["id:ID", ":LABEL", "caption"]
        headers.extend(self.extra)
        headers.extend(prop.name for prop in self.exportable_properties(schema))
        writer.writerow(headers)

    def write_graph(self, extra: Optional[List[str]] = None) -> None:
        """Write all buffered nodes, then all buffered edges, then flush the graph."""
        extra_cells = extra or []
        for node in self.graph.iternodes():
            self.write_node(node, extra_cells)
        for edge in self.graph.iteredges():
            self.write_edge(edge, extra_cells)
        self.graph.flush()

    def write_node(self, node: Node, extra: List[str]) -> None:
        """Write one node row; nodes without an ID are skipped."""
        if node.id is None:
            return None
        first_sighting = node.id not in self.nodes_seen
        if not node.is_entity and first_sighting:
            self.nodes_writer.writerow([node.id, node.type.name, node.caption])
            self.nodes_seen.add(node.id)
        if node.proxy is None or node.schema is None:
            return None
        cells = [node.id, ";".join(node.schema.names), node.caption]
        cells.extend(extra or [])
        for prop, values in self.exportable_fields(node.proxy):
            cells.append(prop.type.join(values))
        self._get_writer(node.schema).writerow(cells)

    def write_edge(self, edge: Edge, extra: List[str]) -> None:
        """Write one edge to the links file and/or its schema's file."""
        if edge.prop is not None:
            link_type = stringcase.constcase(edge.prop.name)
            self.links_writer.writerow(
                [link_type, edge.source_id, edge.target_id, edge.weight]
            )
        proxy = edge.proxy
        if proxy is None:
            return None
        # That potentially may lead to multiple edges with same id
        cells = [
            proxy.id,
            stringcase.constcase(proxy.schema.name),
            edge.source_id,
            edge.target_id,
        ]
        cells.extend(extra or [])
        for prop, values in self.exportable_fields(proxy):
            cells.append(prop.type.join(values))
        self._get_writer(proxy.schema).writerow(cells)

    def finalize_graph(self) -> None:
        """Write the import script listing every CSV file, then close all files."""
        relationships_arg = "\t--relationships={} \\\n"
        nodes_arg = "\t--nodes={} \\\n"
        script_path = self.directory.joinpath("neo4j_import.sh")
        with open(script_path, mode="w") as fp:
            cmd = "{} import --id-type=STRING --database={} \\\n"
            fp.write(cmd.format(NEO4J_ADMIN_PATH, NEO4J_DATABASE_NAME))
            fp.write("\t--multiline-fields=true \\\n")
            links_name = os.path.basename(self.links_handler.name)
            fp.write(relationships_arg.format(links_name))
            nodes_name = os.path.basename(self.nodes_handler.name)
            fp.write(nodes_arg.format(nodes_name))

            for schema, (handle, _) in self.handles.items():
                template = relationships_arg if schema.edge else nodes_arg
                fp.write(template.format(os.path.basename(handle.name)))

        self.links_handler.close()
        self.nodes_handler.close()
        self.close()
114
+
115
+
116
class CypherGraphExporter(GraphExporter):
    """Cypher query format, used for import to Neo4J. This is a bit like
    writing SQL with individual statements - so for large datasets it
    might be a better idea to do a CSV-based import."""

    # https://www.opencypher.org/
    # MATCH (n) DETACH DELETE n;

    def __init__(self, fh: TextIO, edge_types: Iterable[str] = DEFAULT_EDGE_TYPES):
        super().__init__(edge_types=edge_types)
        self.fh = fh
        # Values of proxy-backed nodes that were already written out.
        self.proxy_nodes: Set[str] = set()

    def _to_map(self, data: Dict[str, Any]) -> str:
        """Render the truthy items of `data` as ``key: <json value>`` pairs."""
        return ", ".join(
            "%s: %s" % (key, json.dumps(value))
            for key, value in data.items()
            if value
        )

    def write_graph(self) -> None:
        """Export queries for each graph element."""
        for node in self.graph.iternodes():
            if node.value in self.proxy_nodes or node.id is None:
                continue
            if node.proxy is not None:
                self.proxy_nodes.add(node.value)
            node_attrs = self.get_attributes(node)
            node_attrs["id"] = node.id
            if node.caption is not None:
                node_attrs["caption"] = node.caption
            labels = list(node.schema.names) if node.schema else [node.type.name]
            cypher = "MERGE (p { %(id)s }) " "SET p += { %(map)s } SET p :%(label)s;\n"
            params = {
                "id": self._to_map({"id": node.id}),
                "map": self._to_map(node_attrs),
                "label": ":".join(labels),
            }
            self.fh.write(cypher % params)

        for edge in self.graph.iteredges():
            edge_attrs = self.get_attributes(edge)
            edge_attrs["id"] = edge.id
            edge_attrs["weight"] = str(edge.weight)
            cypher = (
                "MATCH (s { %(source)s }), (t { %(target)s }) "
                "MERGE (s)-[:%(type)s { %(map)s }]->(t);\n"
            )
            params = {
                "source": self._to_map({"id": edge.source_id}),
                "target": self._to_map({"id": edge.target_id}),
                "type": stringcase.constcase(edge.type_name),
                "map": self._to_map(edge_attrs),
            }
            self.fh.write(cypher % params)

        # Written elements are dropped so the next call starts from an empty graph.
        self.graph.flush()