followthemoney 1.3.7__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. followthemoney/__init__.py +5 -3
  2. followthemoney/cli/__init__.py +17 -0
  3. followthemoney/cli/aggregate.py +56 -0
  4. followthemoney/cli/cli.py +88 -0
  5. followthemoney/cli/exports.py +121 -0
  6. followthemoney/cli/mapping.py +85 -0
  7. followthemoney/cli/sieve.py +67 -0
  8. followthemoney/cli/util.py +142 -0
  9. followthemoney/compare.py +130 -60
  10. followthemoney/exc.py +19 -6
  11. followthemoney/export/common.py +29 -0
  12. followthemoney/export/csv.py +82 -0
  13. followthemoney/export/excel.py +75 -0
  14. followthemoney/export/graph.py +79 -0
  15. followthemoney/export/neo4j.py +182 -0
  16. followthemoney/export/rdf.py +26 -0
  17. followthemoney/graph.py +308 -0
  18. followthemoney/helpers.py +212 -0
  19. followthemoney/mapping/__init__.py +1 -1
  20. followthemoney/mapping/csv.py +67 -35
  21. followthemoney/mapping/entity.py +116 -44
  22. followthemoney/mapping/property.py +90 -44
  23. followthemoney/mapping/query.py +27 -19
  24. followthemoney/mapping/source.py +15 -5
  25. followthemoney/mapping/sql.py +75 -61
  26. followthemoney/messages.py +13 -7
  27. followthemoney/model.py +108 -56
  28. followthemoney/namespace.py +119 -0
  29. followthemoney/offshore.py +48 -0
  30. followthemoney/ontology.py +77 -0
  31. followthemoney/property.py +204 -71
  32. followthemoney/proxy.py +455 -118
  33. followthemoney/rdf.py +9 -0
  34. followthemoney/schema/Address.yaml +78 -0
  35. followthemoney/schema/Airplane.yaml +17 -10
  36. followthemoney/schema/Analyzable.yaml +54 -0
  37. followthemoney/schema/Article.yaml +16 -0
  38. followthemoney/schema/Assessment.yaml +32 -0
  39. followthemoney/schema/Asset.yaml +10 -4
  40. followthemoney/schema/Associate.yaml +41 -0
  41. followthemoney/schema/Audio.yaml +24 -0
  42. followthemoney/schema/BankAccount.yaml +53 -9
  43. followthemoney/schema/Call.yaml +48 -0
  44. followthemoney/schema/CallForTenders.yaml +117 -0
  45. followthemoney/schema/Company.yaml +37 -12
  46. followthemoney/schema/Contract.yaml +41 -7
  47. followthemoney/schema/ContractAward.yaml +30 -11
  48. followthemoney/schema/CourtCase.yaml +16 -10
  49. followthemoney/schema/CourtCaseParty.yaml +17 -6
  50. followthemoney/schema/CryptoWallet.yaml +48 -0
  51. followthemoney/schema/Debt.yaml +37 -0
  52. followthemoney/schema/Directorship.yaml +17 -4
  53. followthemoney/schema/Document.yaml +72 -139
  54. followthemoney/schema/Documentation.yml +38 -0
  55. followthemoney/schema/EconomicActivity.yaml +32 -17
  56. followthemoney/schema/Email.yaml +76 -0
  57. followthemoney/schema/Employment.yaml +39 -0
  58. followthemoney/schema/Event.yaml +35 -3
  59. followthemoney/schema/Family.yaml +41 -0
  60. followthemoney/schema/Folder.yaml +13 -0
  61. followthemoney/schema/HyperText.yaml +21 -0
  62. followthemoney/schema/Identification.yaml +40 -0
  63. followthemoney/schema/Image.yaml +25 -0
  64. followthemoney/schema/Interest.yaml +3 -6
  65. followthemoney/schema/Interval.yaml +56 -5
  66. followthemoney/schema/LegalEntity.yaml +81 -20
  67. followthemoney/schema/License.yaml +7 -3
  68. followthemoney/schema/Membership.yaml +19 -4
  69. followthemoney/schema/Mention.yaml +54 -0
  70. followthemoney/schema/Message.yaml +73 -0
  71. followthemoney/schema/Note.yaml +23 -0
  72. followthemoney/schema/Occupancy.yaml +40 -0
  73. followthemoney/schema/Organization.yaml +38 -3
  74. followthemoney/schema/Ownership.yaml +16 -4
  75. followthemoney/schema/Package.yaml +17 -0
  76. followthemoney/schema/Page.yaml +43 -0
  77. followthemoney/schema/Pages.yaml +23 -0
  78. followthemoney/schema/Passport.yaml +15 -17
  79. followthemoney/schema/Payment.yaml +38 -7
  80. followthemoney/schema/Person.yaml +61 -5
  81. followthemoney/schema/PlainText.yaml +17 -0
  82. followthemoney/schema/Position.yaml +50 -0
  83. followthemoney/schema/Post.yaml +42 -0
  84. followthemoney/schema/Project.yaml +27 -0
  85. followthemoney/schema/ProjectParticipant.yaml +36 -0
  86. followthemoney/schema/PublicBody.yaml +14 -3
  87. followthemoney/schema/RealEstate.yaml +19 -3
  88. followthemoney/schema/Representation.yaml +17 -6
  89. followthemoney/schema/Sanction.yaml +44 -20
  90. followthemoney/schema/Security.yaml +59 -0
  91. followthemoney/schema/Similar.yaml +37 -0
  92. followthemoney/schema/Succession.yaml +36 -0
  93. followthemoney/schema/Table.yaml +32 -0
  94. followthemoney/schema/TaxRoll.yaml +27 -9
  95. followthemoney/schema/Thing.yaml +69 -13
  96. followthemoney/schema/Trip.yaml +42 -0
  97. followthemoney/schema/UnknownLink.yaml +17 -6
  98. followthemoney/schema/UserAccount.yaml +44 -0
  99. followthemoney/schema/Value.yaml +5 -1
  100. followthemoney/schema/Vehicle.yaml +25 -8
  101. followthemoney/schema/Vessel.yaml +18 -10
  102. followthemoney/schema/Video.yaml +20 -0
  103. followthemoney/schema/Workbook.yaml +18 -0
  104. followthemoney/schema.py +406 -135
  105. followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
  106. followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
  107. followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
  108. followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
  109. followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
  110. followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
  111. followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
  112. followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
  113. followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
  114. followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
  115. followthemoney/translations/fr/followthemoney.po +3861 -0
  116. followthemoney/translations/messages.pot +3021 -725
  117. followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
  118. followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
  119. followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
  120. followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
  121. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
  122. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
  123. followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
  124. followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
  125. followthemoney/translations/ru/followthemoney.po +4221 -0
  126. followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
  127. followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
  128. followthemoney/types/__init__.py +35 -17
  129. followthemoney/types/address.py +41 -21
  130. followthemoney/types/checksum.py +25 -0
  131. followthemoney/types/common.py +233 -88
  132. followthemoney/types/country.py +89 -56
  133. followthemoney/types/date.py +59 -76
  134. followthemoney/types/email.py +66 -35
  135. followthemoney/types/entity.py +66 -13
  136. followthemoney/types/gender.py +66 -0
  137. followthemoney/types/iban.py +47 -28
  138. followthemoney/types/identifier.py +49 -22
  139. followthemoney/types/ip.py +35 -21
  140. followthemoney/types/json.py +58 -0
  141. followthemoney/types/language.py +124 -37
  142. followthemoney/types/mimetype.py +44 -0
  143. followthemoney/types/name.py +56 -12
  144. followthemoney/types/number.py +30 -0
  145. followthemoney/types/phone.py +92 -34
  146. followthemoney/types/registry.py +52 -0
  147. followthemoney/types/string.py +43 -0
  148. followthemoney/types/topic.py +94 -0
  149. followthemoney/types/url.py +39 -17
  150. followthemoney/util.py +139 -45
  151. followthemoney-3.8.0.dist-info/METADATA +153 -0
  152. followthemoney-3.8.0.dist-info/RECORD +157 -0
  153. {followthemoney-1.3.7.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
  154. followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
  155. followthemoney-1.3.7.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
  156. followthemoney/link.py +0 -75
  157. followthemoney/schema/Associate.yml +0 -19
  158. followthemoney/schema/Family.yml +0 -19
  159. followthemoney/schema/Land.yml +0 -9
  160. followthemoney/schema/Relationship.yaml +0 -26
  161. followthemoney/types/domain.py +0 -50
  162. followthemoney-1.3.7.dist-info/DESCRIPTION.rst +0 -3
  163. followthemoney-1.3.7.dist-info/METADATA +0 -39
  164. followthemoney-1.3.7.dist-info/RECORD +0 -108
  165. followthemoney-1.3.7.dist-info/entry_points.txt +0 -3
  166. followthemoney-1.3.7.dist-info/metadata.json +0 -1
  167. followthemoney-1.3.7.dist-info/namespace_packages.txt +0 -1
  168. followthemoney-1.3.7.dist-info/top_level.txt +0 -3
  169. ns/ontology.py +0 -128
  170. tests/types/test_addresses.py +0 -24
  171. tests/types/test_common.py +0 -32
  172. tests/types/test_countries.py +0 -27
  173. tests/types/test_dates.py +0 -73
  174. tests/types/test_domains.py +0 -23
  175. tests/types/test_emails.py +0 -32
  176. tests/types/test_entity.py +0 -19
  177. tests/types/test_iban.py +0 -109
  178. tests/types/test_identifiers.py +0 -27
  179. tests/types/test_ip.py +0 -29
  180. tests/types/test_languages.py +0 -23
  181. tests/types/test_names.py +0 -33
  182. tests/types/test_phones.py +0 -24
  183. tests/types/test_registry.py +0 -14
  184. tests/types/test_urls.py +0 -23
  185. {ns → followthemoney/export}/__init__.py +0 -0
  186. /tests/types/__init__.py → /followthemoney/py.typed +0 -0
@@ -0,0 +1,212 @@
1
+ # This module violates the boundary between the role of code and
2
+ # YAML in the rest of followthemoney. It handles normalisations
3
+ # which would be much harder to express in abstract, especially
4
+ # those that simplify the data based on their pragmatics.
5
+ #
6
+ # If anyone were to swap out the default model, this would
7
+ # probably be the first place to break.
8
+ from os.path import splitext
9
+ from typing import Iterable, List, Optional, Set
10
+ from normality import safe_filename
11
+ from mimetypes import guess_extension
12
+ from itertools import product
13
+ from datetime import datetime, timedelta
14
+
15
+ from followthemoney.types import registry
16
+ from followthemoney.proxy import E
17
+ from followthemoney.util import join_text
18
+
19
+ PROV_MIN_DATES = ("createdAt", "authoredAt", "publishedAt")
20
+ PROV_MAX_DATES = ("modifiedAt", "retrievedAt")
21
+
22
+
23
def remove_checksums(proxy: E) -> E:
    """Drop every checksum-typed property from the given entity.

    When entities are accepted via a web API, letting a user submit
    checksum-type properties would constitute a security risk: the values
    can be traded in for access to the matching files if they exist in the
    underlying content-addressed storage. The safest option is to remove
    all checksums from entities that are untrusted user input.

    The proxy is modified in place and returned.
    """
    checksum_type = registry.checksum
    for candidate in proxy.iterprops():
        if candidate.type == checksum_type:
            proxy.pop(candidate)
    return proxy
33
+
34
+
35
def simplify_provenance(proxy: E) -> E:
    """Collapse multi-valued provenance dates to a single value each.

    For the properties in ``PROV_MAX_DATES`` only the greatest value is
    kept; for those in ``PROV_MIN_DATES`` only the smallest one. Properties
    without any value are left untouched. The proxy is modified in place
    and returned.
    """
    # "max" properties are processed first, then the "min" ones.
    selectors = ((PROV_MAX_DATES, max), (PROV_MIN_DATES, min))
    for prop_names, pick in selectors:
        for prop_name in prop_names:
            found = proxy.pop(prop_name, quiet=True)
            if found:
                proxy.set(prop_name, pick(found), cleaned=True)
    return proxy
47
+
48
+
49
def entity_filename(
    proxy: E, base_name: Optional[str] = None, extension: Optional[str] = None
) -> Optional[str]:
    """Derive a safe filename for the given entity.

    Explicitly passed ``base_name`` and ``extension`` always win. For
    ``Document`` entities, missing parts are filled in from (in this
    order) the ``extension`` property, the ``fileName`` property and an
    extension guessed from the ``mimeType`` property. If no base name is
    found, the entity ID is used instead.
    """
    if proxy.schema.is_a("Document"):
        # An explicit extension property: only its first value is used.
        declared = proxy.get("extension", quiet=True)
        if extension is None and declared:
            extension = declared[0]

        # File names can contribute both a stem and a suffix.
        for file_name in proxy.get("fileName", quiet=True):
            stem, suffix = splitext(file_name)
            if base_name is None and stem:
                base_name = stem
            if extension is None and suffix:
                extension = suffix

        # Last resort: keep guessing from MIME types until one maps to an
        # extension.
        for mime_type in proxy.get("mimeType", quiet=True):
            if extension is None:
                extension = guess_extension(mime_type)
            else:
                break

    if not base_name:
        base_name = proxy.id
    return safe_filename(base_name, extension=extension)
70
+
71
+
72
def name_entity(entity: E) -> E:
    """Reduce a multi-named ``Thing`` to one name plus aliases.

    If the entity has more than one ``name`` value, the one chosen by
    ``registry.name.pick`` becomes the only name and the remaining values
    are added as ``alias``. This is awkward given that names are not
    special and may not always be the caption. The entity is modified in
    place and returned.
    """
    if not entity.schema.is_a("Thing"):
        return entity

    names = entity.get("name")
    if len(names) <= 1:
        return entity

    primary = registry.name.pick(names)
    if primary in names:
        # Whatever is left in the list afterwards becomes an alias.
        names.remove(primary)
    entity.set("name", primary)
    entity.add("alias", names)
    return entity
85
+
86
+
87
def check_person_cutoff(
    entity: E,
    death_cutoff: datetime = datetime(2000, 1, 1),
    birth_cutoff: Optional[datetime] = None,
) -> bool:
    """Tell whether a person is too long dead to matter for investigations.

    Returns ``True`` if the entity is a ``Person`` and either its latest
    ``deathDate`` lies before ``death_cutoff`` or its earliest
    ``birthDate`` lies before ``birth_cutoff``. When no ``birth_cutoff``
    is given, it defaults to ``death_cutoff`` minus ``100 * 365`` days.
    Dates are compared as strings against the ISO form of the cutoffs.
    """
    if not entity.schema.is_a("Person"):
        return False

    died = entity.get("deathDate", quiet=True)
    if died and max(died) < death_cutoff.isoformat():
        return True

    born = entity.get("birthDate", quiet=True)
    if birth_cutoff is None:
        birth_cutoff = death_cutoff - timedelta(days=100 * 365)
    return bool(born) and min(born) < birth_cutoff.isoformat()
107
+
108
+
109
def remove_prefix_dates(entity: E) -> E:
    """Drop date values that are mere prefixes of more precise ones.

    For example, if a Person has both a birthDate of 1990 and of
    1990-05-01, only 1990-05-01 is kept. Every date-typed property of the
    entity is rewritten via ``remove_prefix_date_values``; the entity is
    modified in place and returned.
    """
    date_props = (p for p in entity.iterprops() if p.type == registry.date)
    for date_prop in date_props:
        reduced = remove_prefix_date_values(entity.get(date_prop))
        entity.set(date_prop, reduced)
    return entity
119
+
120
+
121
def remove_prefix_date_values(values: Iterable[str]) -> List[str]:
    """Remove all values that are a prefix of another, longer value.

    See ``remove_prefix_dates``. Given ``1990`` and ``1990-05-01`` only
    ``1990-05-01`` survives. Exact duplicates are collapsed as well, since
    a value is a prefix of itself.

    Args:
        values: date strings; any iterable is accepted (it is consumed once).

    Returns:
        The surviving values, longest first; values of equal length keep
        their input order.
    """
    # Longest first, so anything a value could be a prefix of has already
    # been seen by the time the value itself is examined.
    ordered = sorted(values, key=len, reverse=True)
    kept: List[str] = []
    for value in ordered:
        # Checking against the kept values is sufficient: every dropped
        # value is itself a prefix of some kept value.
        if not any(longer.startswith(value) for longer in kept):
            kept.append(value)
    return kept
134
+
135
+
136
def inline_names(entity: E, related: E) -> None:
    """Copy the names of a related entity into ``namesMentioned``.

    This attempts to solve a weird UI problem. Imagine showing a user a
    list of payments between a sender and a beneficiary. A search for a
    term in the sender or recipient name finds nothing, because the name
    is only indexed with the parties and not with the payment. Copying
    the names over is part of a partial work-around for that.

    Nothing happens if the schema of ``entity`` has no ``namesMentioned``
    property. This is really bad in theory, but really useful in practice.
    """
    names_prop = entity.schema.get("namesMentioned")
    if names_prop is None:
        return
    entity.add(names_prop, related.get_type_values(registry.name))
148
+
149
+
150
def combine_names(entity: E) -> E:
    """Build full-name aliases from the name parts of a person entity.

    Every combination of first, second, middle, father and last name is
    joined and added as an ``alias``. This is of course impossible to do
    culturally correctly for the whole planet at once, so it should mostly
    be used for internal-facing (e.g. matching) processes. Non-person
    entities are returned unchanged.
    """
    if not entity.schema.is_a("Person"):
        return entity

    first_names = entity.get("firstName")
    # Optional parts get an extra empty entry, so that combinations which
    # leave the part out are generated as well.
    optional_parts = []
    for part_name in ("secondName", "middleName", "fatherName"):
        candidates = entity.get(part_name)
        candidates.append("")
        optional_parts.append(candidates)
    last_names = entity.get("lastName")

    for parts in product(first_names, *optional_parts, last_names):
        full_name = join_text(*parts)
        if full_name is not None:
            entity.add("alias", full_name)

    # If the entity still has no name-typed value at all (e.g. because no
    # first name is given), at least add the last names:
    if not entity.get_type_values(registry.name) and len(last_names):
        entity.add("alias", last_names)
    return entity
175
+
176
+
177
def dates_years(dates: Iterable[Optional[str]]) -> Set[str]:
    """Get the unique years from a set of date strings.

    The year is taken to be the first four characters of each date.
    ``None`` and empty strings are skipped, so that they cannot introduce
    an empty "year" into the result.

    Args:
        dates: date strings, possibly mixed with ``None`` values.

    Returns:
        The set of distinct year prefixes.
    """
    return {date[:4] for date in dates if date}
184
+
185
+
186
def post_summary(
    organization: str,
    role: Optional[str],
    start_dates: Iterable[Optional[str]],
    end_dates: Iterable[Optional[str]],
    dates: Iterable[Optional[str]],
) -> str:
    """Make a string summary for a Post object.

    The result is the organization, optionally followed by a bracketed
    suffix holding the role and/or a date range, e.g.
    ``Org (Role, 2001-2005)``. The range is built from the start and end
    years if either exists; otherwise the years of ``dates`` are listed,
    comma-separated and sorted.
    """
    start = min(dates_years(start_dates), default="")
    # NOTE(review): the end year is also picked with min(), i.e. the
    # earliest end year wins - confirm this is intended rather than max().
    end = min(dates_years(end_dates), default="")
    years = dates_years(dates)

    date_range: Optional[str] = None
    if start or end:
        date_range = f"{start}-{end}"
    elif years:
        date_range = ", ".join(sorted(years))

    # Role first, then the dates; empty parts are simply left out.
    bracketed = ", ".join(part for part in (role, date_range) if part)
    if not bracketed:
        return organization
    return f"{organization} ({bracketed})"
@@ -1,3 +1,3 @@
1
1
  from followthemoney.mapping.query import QueryMapping
2
2
 
3
- __all__ = [QueryMapping]
3
+ __all__ = ["QueryMapping"]
@@ -1,65 +1,97 @@
1
1
  import io
2
2
  import os
3
3
  import logging
4
+ from banal.lists import ensure_list
4
5
  import requests
5
6
  from csv import DictReader
6
- from banal import ensure_list
7
- from normality import stringify
7
+ from urllib.parse import urlparse
8
+ from banal import keys_values
9
+ from typing import (
10
+ TYPE_CHECKING,
11
+ Any,
12
+ Dict,
13
+ Generator,
14
+ ItemsView,
15
+ Iterable,
16
+ List,
17
+ Optional,
18
+ Set,
19
+ Tuple,
20
+ cast,
21
+ )
8
22
 
9
- from followthemoney.mapping.source import Source
23
+ from followthemoney.mapping.source import Record, Source
24
+ from followthemoney.util import sanitize_text
10
25
  from followthemoney.exc import InvalidMapping
11
26
 
27
+ if TYPE_CHECKING:
28
+ from followthemoney.mapping.query import QueryMapping
29
+
12
30
  log = logging.getLogger(__name__)
31
+ FilterList = List[Tuple[str, Set[Optional[str]]]]
13
32
 
14
33
 
15
34
  class CSVSource(Source):
16
35
  """Special case for entity loading directly from a CSV URL"""
17
36
 
18
- def __init__(self, query, data):
19
- super(CSVSource, self).__init__(query, data)
20
- urls = ensure_list(data.get('csv_url'))
21
- urls.extend(ensure_list(data.get('csv_urls')))
22
- self.urls = set()
23
- for url in urls:
24
- self.urls.add(os.path.expandvars(url))
37
+ def __init__(self, query: "QueryMapping", data: Dict[str, Any]) -> None:
38
+ super().__init__(query, data)
39
+ self.urls: Set[str] = set()
40
+ for url in keys_values(data, "csv_url", "csv_urls"):
41
+ self.urls.add(cast(str, os.path.expandvars(url)))
25
42
 
26
43
  if not len(self.urls):
27
44
  raise InvalidMapping("No CSV URLs are specified.")
28
45
 
29
- def read_csv(self, url):
30
- parsed_url = requests.utils.urlparse(url)
46
+ self.filters_set = self._parse_filters(self.filters)
47
+ self.filters_not_set = self._parse_filters(self.filters_not)
48
+
49
+ def _parse_filters(self, filters: ItemsView[str, Any]) -> FilterList:
50
+ filters_set: FilterList = []
51
+ for (key, value) in filters:
52
+ values = set(cast(List[Optional[str]], ensure_list(value)))
53
+ filters_set.append((key, values))
54
+ return filters_set
55
+
56
+ def check_filters(self, data: Record) -> bool:
57
+ for (k, v) in self.filters_set:
58
+ if data.get(k) not in v:
59
+ return False
60
+ for (k, v) in self.filters_not_set:
61
+ if data.get(k) in v:
62
+ return False
63
+ return True
64
+
65
+ @classmethod
66
+ def read_csv(cls, fh: Iterable[str]) -> Generator[Record, None, None]:
67
+ for row in DictReader(fh, skipinitialspace=True):
68
+ data: Record = {}
69
+ for ref, ref_value in row.items():
70
+ value = sanitize_text(ref_value)
71
+ if value is not None:
72
+ data[ref] = value
73
+ yield data
74
+
75
+ def read_csv_url(self, url: str) -> Generator[Record, None, None]:
76
+ parsed_url = urlparse(url)
31
77
  log.info("Loading: %s", url)
32
- if parsed_url.scheme in ['http', 'https']:
78
+ if parsed_url.scheme in ["http", "https"]:
33
79
  res = requests.get(url, stream=True)
34
80
  if not res.ok:
35
81
  raise InvalidMapping("Failed to open CSV: %s" % url)
36
82
  # if res.encoding is None:
37
- res.encoding = 'utf-8'
83
+ res.encoding = "utf-8"
38
84
  # log.info("Detected encoding: %s", res.encoding)
39
85
  lines = res.iter_lines(decode_unicode=True)
40
- for row in DictReader(lines, skipinitialspace=True):
41
- yield row
86
+ yield from self.read_csv(lines)
42
87
  else:
43
- with io.open(parsed_url.path, 'r') as fh:
44
- for row in DictReader(fh, skipinitialspace=True):
45
- yield row
46
-
47
- def check_filters(self, data):
48
- for (k, v) in self.filters:
49
- if v != data.get(k):
50
- return False
51
- for (k, v) in self.filters_not:
52
- if v == data.get(k):
53
- return False
54
- return True
88
+ with io.open(parsed_url.path, "r") as fh:
89
+ yield from self.read_csv(fh)
55
90
 
56
91
  @property
57
- def records(self):
92
+ def records(self) -> Generator[Record, None, None]:
58
93
  """Iterate through the table applying filters on-the-go."""
59
94
  for url in self.urls:
60
- for row in self.read_csv(url):
61
- data = {}
62
- for ref in self.query.refs:
63
- data[ref] = stringify(row.get(ref))
64
- if self.check_filters(data):
65
- yield data
95
+ for record in self.read_csv_url(url):
96
+ if self.check_filters(record):
97
+ yield record
@@ -1,62 +1,111 @@
1
+ import logging
1
2
  from hashlib import sha1
2
- from banal import ensure_list
3
+ from warnings import warn
4
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
5
+ from banal import keys_values
6
+ from normality import stringify
3
7
 
4
- from followthemoney.mapping.property import PropertyMapping
5
8
  from followthemoney.types import registry
6
9
  from followthemoney.util import key_bytes
10
+ from followthemoney.proxy import EntityProxy
11
+ from followthemoney.mapping.property import PropertyMapping
12
+ from followthemoney.mapping.source import Record
7
13
  from followthemoney.exc import InvalidMapping
8
14
 
15
+ if TYPE_CHECKING:
16
+ from followthemoney.model import Model
17
+ from followthemoney.mapping.query import QueryMapping
18
+
19
+ log = logging.getLogger(__name__)
20
+
9
21
 
10
22
  class EntityMapping(object):
23
+ __slots__ = (
24
+ "model",
25
+ "name",
26
+ "seed",
27
+ "keys",
28
+ "id_column",
29
+ "schema",
30
+ "refs",
31
+ "dependencies",
32
+ "properties",
33
+ )
11
34
 
12
- def __init__(self, model, query, name, data, key_prefix=None):
35
+ def __init__(
36
+ self,
37
+ model: "Model",
38
+ query: "QueryMapping",
39
+ name: str,
40
+ data: Dict[str, Any],
41
+ key_prefix: Optional[str] = None,
42
+ ) -> None:
13
43
  self.model = model
14
44
  self.name = name
15
- self.data = data
16
45
 
17
46
  self.seed = sha1(key_bytes(key_prefix))
18
- self.seed.update(key_bytes(data.get('key_literal')))
47
+ self.seed.update(key_bytes(data.get("key_literal")))
19
48
 
20
- self.keys = ensure_list(data.get('key'))
21
- self.keys.extend(ensure_list(data.get('keys')))
22
- if not len(self.keys):
23
- raise InvalidMapping("No keys: %r" % name)
49
+ self.keys = keys_values(data, "key", "keys")
50
+ self.id_column = stringify(data.get("id_column"))
51
+ if not len(self.keys) and self.id_column is None:
52
+ raise InvalidMapping("No keys or ID: %r" % name)
53
+ if len(self.keys) and self.id_column is not None:
54
+ msg = "Please use only keys or id_column, not both: %r" % name
55
+ raise InvalidMapping(msg)
24
56
 
25
- self.schema = model.get(data.get('schema'))
26
- if self.schema is None:
27
- raise InvalidMapping("Invalid schema: %s" % data.get('schema'))
57
+ schema_name = stringify(data.get("schema"))
58
+ if schema_name is None:
59
+ raise InvalidMapping("No schema: %s" % name)
60
+ schema = model.get(schema_name)
61
+ if schema is None:
62
+ raise InvalidMapping("Invalid schema: %s" % schema_name)
63
+ if schema.deprecated:
64
+ warn(
65
+ "Mapping uses a deprecated schema: %r" % schema,
66
+ DeprecationWarning,
67
+ stacklevel=2,
68
+ )
69
+ self.schema = schema
28
70
 
29
71
  self.refs = set(self.keys)
30
- self.dependencies = set()
31
- self.properties = []
32
- for name, prop in data.get('properties', {}).items():
33
- prop_schema = self.schema.get(name)
34
- if prop_schema is None:
72
+ if self.id_column:
73
+ self.refs.add(self.id_column)
74
+ self.dependencies: Set[str] = set()
75
+ self.properties: List[PropertyMapping] = []
76
+ for name, prop_mapping in data.get("properties", {}).items():
77
+ prop = self.schema.get(name)
78
+ if prop is None:
35
79
  raise InvalidMapping("Invalid property: %s" % name)
36
- prop = PropertyMapping(query, prop, prop_schema)
37
- self.properties.append(prop)
38
- self.refs.update(prop.refs)
39
- if prop.entity:
40
- self.dependencies.add(prop.entity)
80
+ mapping = PropertyMapping(query, prop_mapping, prop)
81
+ self.properties.append(mapping)
82
+ self.refs.update(mapping.refs)
83
+ if mapping.entity:
84
+ self.dependencies.add(mapping.entity)
41
85
 
42
- def bind(self):
86
+ def bind(self) -> None:
43
87
  for prop in self.properties:
44
88
  prop.bind()
45
89
 
46
- def compute_key(self, record):
90
+ def compute_key(self, record: Record) -> Optional[str]:
47
91
  """Generate a key for this entity, based on the given fields."""
92
+ if self.id_column is not None:
93
+ return record.get(self.id_column)
48
94
  values = [key_bytes(record.get(k)) for k in self.keys]
49
95
  digest = self.seed.copy()
96
+ has_value = False
50
97
  for value in sorted(values):
51
- digest.update(value)
52
- if digest.digest() != self.seed.digest():
98
+ if len(value):
99
+ has_value = True
100
+ digest.update(value)
101
+ if has_value:
53
102
  return digest.hexdigest()
103
+ return None
54
104
 
55
- def map(self, record, entities):
105
+ def map(
106
+ self, record: Record, entities: Dict[str, EntityProxy]
107
+ ) -> Optional[EntityProxy]:
56
108
  proxy = self.model.make_entity(self.schema)
57
- proxy.id = self.compute_key(record)
58
- if proxy.id is None:
59
- return
60
109
 
61
110
  # THIS IS HACKY
62
111
  # Some of the converters, e.g. for phone numbers, work better if they
@@ -64,23 +113,46 @@ class EntityMapping(object):
64
113
  # detail, we are first running country fields, then making the data
65
114
  # from that accessible to phone and address parsers.
66
115
  for prop in self.properties:
67
- if prop.schema.type == registry.country:
68
- proxy.add(prop.schema, prop.map(record, entities))
116
+ if prop.prop.type == registry.country:
117
+ discarded_values = prop.map(proxy, record, entities)
118
+ for value in discarded_values:
119
+ log.warning(
120
+ f'[{self.name}] Discarded unclean value "{value}" for property "{prop.prop.qname}".'
121
+ )
69
122
 
70
123
  for prop in self.properties:
71
- if prop.schema.type != registry.country:
72
- proxy.add(prop.schema, prop.map(record, entities,
73
- countries=proxy.countries))
124
+ if prop.prop.type != registry.country:
125
+ discarded_values = prop.map(proxy, record, entities)
126
+ for value in discarded_values:
127
+ log.warning(
128
+ f'[{self.name}] Discarding unclean value "{value}" for property "{prop.prop.qname}".'
129
+ )
130
+
131
+ # Generate the ID at the end to avoid self-reference checks on empty
132
+ # keys.
133
+ proxy.id = self.compute_key(record)
134
+ if proxy.id is None:
135
+ if self.id_column:
136
+ log.warning(
137
+ f'[{self.name}] Skipping entity because no ID could be computed. Make sure that there are no empty values in the "{self.id_column}" column.'
138
+ )
139
+ if self.keys:
140
+ log.warning(
141
+ f"[{self.name}] Skipping entity because no ID could be computed. Make sure that there are no empty values in key columns."
142
+ )
143
+ return None
74
144
 
75
145
  for prop in self.properties:
76
- if prop.required:
77
- if not len(proxy.get(prop.schema)):
78
- # This is a bit weird, it flags fields to be required in
79
- # the mapping, not in the model. Basically it means: if
80
- # this row of source data doesn't have that field, then do
81
- # not map it again.
82
- return
146
+ if prop.required and not proxy.has(prop.prop):
147
+ # This is a bit weird, it flags fields to be required in
148
+ # the mapping, not in the model. Basically it means: if
149
+ # this row of source data doesn't have that field, then do
150
+ # not map it again.
151
+ log.warning(
152
+ f'[{self.name}] Skipping entity because required property "{prop.prop.name}" is empty.'
153
+ )
154
+ return None
83
155
  return proxy
84
156
 
85
- def __repr__(self):
86
- return '<EntityMapping(%r)>' % self.name
157
+ def __repr__(self) -> str:
158
+ return "<EntityMapping(%r)>" % self.name