followthemoney 1.3.6__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. followthemoney/__init__.py +5 -3
  2. followthemoney/cli/__init__.py +17 -0
  3. followthemoney/cli/aggregate.py +56 -0
  4. followthemoney/cli/cli.py +88 -0
  5. followthemoney/cli/exports.py +121 -0
  6. followthemoney/cli/mapping.py +85 -0
  7. followthemoney/cli/sieve.py +67 -0
  8. followthemoney/cli/util.py +142 -0
  9. followthemoney/compare.py +132 -55
  10. followthemoney/exc.py +19 -6
  11. followthemoney/export/common.py +29 -0
  12. followthemoney/export/csv.py +82 -0
  13. followthemoney/export/excel.py +75 -0
  14. followthemoney/export/graph.py +79 -0
  15. followthemoney/export/neo4j.py +182 -0
  16. followthemoney/export/rdf.py +26 -0
  17. followthemoney/graph.py +308 -0
  18. followthemoney/helpers.py +212 -0
  19. followthemoney/mapping/__init__.py +1 -1
  20. followthemoney/mapping/csv.py +67 -35
  21. followthemoney/mapping/entity.py +116 -44
  22. followthemoney/mapping/property.py +90 -44
  23. followthemoney/mapping/query.py +27 -19
  24. followthemoney/mapping/source.py +15 -5
  25. followthemoney/mapping/sql.py +75 -61
  26. followthemoney/messages.py +13 -7
  27. followthemoney/model.py +108 -56
  28. followthemoney/namespace.py +119 -0
  29. followthemoney/offshore.py +48 -0
  30. followthemoney/ontology.py +77 -0
  31. followthemoney/property.py +204 -71
  32. followthemoney/proxy.py +455 -118
  33. followthemoney/rdf.py +9 -0
  34. followthemoney/schema/Address.yaml +78 -0
  35. followthemoney/schema/Airplane.yaml +17 -10
  36. followthemoney/schema/Analyzable.yaml +54 -0
  37. followthemoney/schema/Article.yaml +16 -0
  38. followthemoney/schema/Assessment.yaml +32 -0
  39. followthemoney/schema/Asset.yaml +10 -4
  40. followthemoney/schema/Associate.yaml +41 -0
  41. followthemoney/schema/Audio.yaml +24 -0
  42. followthemoney/schema/BankAccount.yaml +53 -9
  43. followthemoney/schema/Call.yaml +48 -0
  44. followthemoney/schema/CallForTenders.yaml +117 -0
  45. followthemoney/schema/Company.yaml +37 -12
  46. followthemoney/schema/Contract.yaml +41 -7
  47. followthemoney/schema/ContractAward.yaml +30 -11
  48. followthemoney/schema/CourtCase.yaml +16 -10
  49. followthemoney/schema/CourtCaseParty.yaml +17 -6
  50. followthemoney/schema/CryptoWallet.yaml +48 -0
  51. followthemoney/schema/Debt.yaml +37 -0
  52. followthemoney/schema/Directorship.yaml +17 -4
  53. followthemoney/schema/Document.yaml +72 -139
  54. followthemoney/schema/Documentation.yml +38 -0
  55. followthemoney/schema/EconomicActivity.yaml +32 -17
  56. followthemoney/schema/Email.yaml +76 -0
  57. followthemoney/schema/Employment.yaml +39 -0
  58. followthemoney/schema/Event.yaml +35 -3
  59. followthemoney/schema/Family.yaml +41 -0
  60. followthemoney/schema/Folder.yaml +13 -0
  61. followthemoney/schema/HyperText.yaml +21 -0
  62. followthemoney/schema/Identification.yaml +40 -0
  63. followthemoney/schema/Image.yaml +25 -0
  64. followthemoney/schema/Interest.yaml +3 -6
  65. followthemoney/schema/Interval.yaml +56 -5
  66. followthemoney/schema/LegalEntity.yaml +81 -20
  67. followthemoney/schema/License.yaml +7 -3
  68. followthemoney/schema/Membership.yaml +19 -4
  69. followthemoney/schema/Mention.yaml +54 -0
  70. followthemoney/schema/Message.yaml +73 -0
  71. followthemoney/schema/Note.yaml +23 -0
  72. followthemoney/schema/Occupancy.yaml +40 -0
  73. followthemoney/schema/Organization.yaml +38 -3
  74. followthemoney/schema/Ownership.yaml +16 -4
  75. followthemoney/schema/Package.yaml +17 -0
  76. followthemoney/schema/Page.yaml +43 -0
  77. followthemoney/schema/Pages.yaml +23 -0
  78. followthemoney/schema/Passport.yaml +15 -17
  79. followthemoney/schema/Payment.yaml +38 -7
  80. followthemoney/schema/Person.yaml +61 -5
  81. followthemoney/schema/PlainText.yaml +17 -0
  82. followthemoney/schema/Position.yaml +50 -0
  83. followthemoney/schema/Post.yaml +42 -0
  84. followthemoney/schema/Project.yaml +27 -0
  85. followthemoney/schema/ProjectParticipant.yaml +36 -0
  86. followthemoney/schema/PublicBody.yaml +14 -3
  87. followthemoney/schema/RealEstate.yaml +19 -3
  88. followthemoney/schema/Representation.yaml +17 -6
  89. followthemoney/schema/Sanction.yaml +44 -20
  90. followthemoney/schema/Security.yaml +59 -0
  91. followthemoney/schema/Similar.yaml +37 -0
  92. followthemoney/schema/Succession.yaml +36 -0
  93. followthemoney/schema/Table.yaml +32 -0
  94. followthemoney/schema/TaxRoll.yaml +27 -9
  95. followthemoney/schema/Thing.yaml +69 -13
  96. followthemoney/schema/Trip.yaml +42 -0
  97. followthemoney/schema/UnknownLink.yaml +17 -6
  98. followthemoney/schema/UserAccount.yaml +44 -0
  99. followthemoney/schema/Value.yaml +5 -1
  100. followthemoney/schema/Vehicle.yaml +25 -8
  101. followthemoney/schema/Vessel.yaml +18 -10
  102. followthemoney/schema/Video.yaml +20 -0
  103. followthemoney/schema/Workbook.yaml +18 -0
  104. followthemoney/schema.py +406 -135
  105. followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
  106. followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
  107. followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
  108. followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
  109. followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
  110. followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
  111. followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
  112. followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
  113. followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
  114. followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
  115. followthemoney/translations/fr/followthemoney.po +3861 -0
  116. followthemoney/translations/messages.pot +3021 -725
  117. followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
  118. followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
  119. followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
  120. followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
  121. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
  122. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
  123. followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
  124. followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
  125. followthemoney/translations/ru/followthemoney.po +4221 -0
  126. followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
  127. followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
  128. followthemoney/types/__init__.py +35 -17
  129. followthemoney/types/address.py +41 -21
  130. followthemoney/types/checksum.py +25 -0
  131. followthemoney/types/common.py +233 -88
  132. followthemoney/types/country.py +89 -56
  133. followthemoney/types/date.py +59 -76
  134. followthemoney/types/email.py +66 -35
  135. followthemoney/types/entity.py +66 -13
  136. followthemoney/types/gender.py +66 -0
  137. followthemoney/types/iban.py +47 -28
  138. followthemoney/types/identifier.py +49 -22
  139. followthemoney/types/ip.py +35 -21
  140. followthemoney/types/json.py +58 -0
  141. followthemoney/types/language.py +124 -37
  142. followthemoney/types/mimetype.py +44 -0
  143. followthemoney/types/name.py +56 -12
  144. followthemoney/types/number.py +30 -0
  145. followthemoney/types/phone.py +92 -34
  146. followthemoney/types/registry.py +52 -0
  147. followthemoney/types/string.py +43 -0
  148. followthemoney/types/topic.py +94 -0
  149. followthemoney/types/url.py +39 -17
  150. followthemoney/util.py +139 -45
  151. followthemoney-3.8.0.dist-info/METADATA +153 -0
  152. followthemoney-3.8.0.dist-info/RECORD +157 -0
  153. {followthemoney-1.3.6.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
  154. followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
  155. followthemoney-1.3.6.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
  156. followthemoney/link.py +0 -75
  157. followthemoney/schema/Associate.yml +0 -19
  158. followthemoney/schema/Family.yml +0 -19
  159. followthemoney/schema/Land.yml +0 -9
  160. followthemoney/schema/Relationship.yaml +0 -26
  161. followthemoney/types/domain.py +0 -50
  162. followthemoney-1.3.6.dist-info/DESCRIPTION.rst +0 -3
  163. followthemoney-1.3.6.dist-info/METADATA +0 -39
  164. followthemoney-1.3.6.dist-info/RECORD +0 -108
  165. followthemoney-1.3.6.dist-info/entry_points.txt +0 -3
  166. followthemoney-1.3.6.dist-info/metadata.json +0 -1
  167. followthemoney-1.3.6.dist-info/namespace_packages.txt +0 -1
  168. followthemoney-1.3.6.dist-info/top_level.txt +0 -3
  169. ns/ontology.py +0 -128
  170. tests/types/test_addresses.py +0 -24
  171. tests/types/test_common.py +0 -27
  172. tests/types/test_countries.py +0 -21
  173. tests/types/test_dates.py +0 -72
  174. tests/types/test_domains.py +0 -23
  175. tests/types/test_emails.py +0 -30
  176. tests/types/test_entity.py +0 -16
  177. tests/types/test_iban.py +0 -109
  178. tests/types/test_identifiers.py +0 -25
  179. tests/types/test_ip.py +0 -26
  180. tests/types/test_languages.py +0 -20
  181. tests/types/test_names.py +0 -33
  182. tests/types/test_phones.py +0 -24
  183. tests/types/test_registry.py +0 -14
  184. tests/types/test_urls.py +0 -23
  185. {ns → followthemoney/export}/__init__.py +0 -0
  186. /tests/types/__init__.py → /followthemoney/py.typed +0 -0
@@ -1,18 +1,24 @@
1
1
  import os
2
- import six
3
2
  import logging
4
3
  from uuid import uuid4
5
- from banal import ensure_list
6
- from normality import stringify
7
- from sqlalchemy import create_engine, MetaData
8
- from sqlalchemy import select, func
9
- # from sqlalchemy import text as sql_text
4
+ from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Union, cast
5
+ from banal import ensure_list, is_listish, keys_values
6
+ from sqlalchemy import MetaData, func
7
+ from sqlalchemy.future import select
8
+ from sqlalchemy.engine import Engine, create_engine
9
+ from sqlalchemy.sql.elements import Label
10
10
  from sqlalchemy.pool import NullPool
11
11
  from sqlalchemy.schema import Table
12
+ from sqlalchemy.sql.expression import Select
12
13
 
13
- from followthemoney.mapping.source import Source
14
+ from followthemoney.mapping.source import Record, Source
15
+ from followthemoney.util import sanitize_text
14
16
  from followthemoney.exc import InvalidMapping
15
17
 
18
+ if TYPE_CHECKING:
19
+ from followthemoney.mapping.query import QueryMapping
20
+
21
+
16
22
  log = logging.getLogger(__name__)
17
23
  DATA_PAGE = 1000
18
24
 
@@ -20,20 +26,22 @@ DATA_PAGE = 1000
20
26
  class QueryTable(object):
21
27
  """A table to be joined in."""
22
28
 
23
- def __init__(self, query, data):
24
- self.query = query
25
- if isinstance(data, six.string_types):
26
- data = {'table': data}
27
- self.data = data
28
- self.table_ref = data.get('table')
29
- self.alias_ref = data.get('alias', self.table_ref)
30
- self.table = Table(self.table_ref, self.query.meta, autoload=True)
31
- self.alias = self.table.alias(self.alias_ref)
32
-
33
- self.refs = {}
29
+ def __init__(
30
+ self, meta: MetaData, engine: Engine, data: Union[str, Dict[str, str]]
31
+ ) -> None:
32
+ if isinstance(data, str):
33
+ data = {"table": data}
34
+ table_ref = data.get("table")
35
+ if table_ref is None:
36
+ raise InvalidMapping("Query has no table!")
37
+ alias_ref = data.get("alias", table_ref)
38
+ self.table = Table(table_ref, meta, autoload_with=engine)
39
+ self.alias = self.table.alias(alias_ref)
40
+
41
+ self.refs: Dict[str, Label[Any]] = {}
34
42
  for column in self.alias.columns:
35
- name = '%s.%s' % (self.alias_ref, column.name)
36
- labeled_column = column.label('col_%s' % uuid4().hex[:10])
43
+ name = "%s.%s" % (alias_ref, column.name)
44
+ labeled_column = column.label("col_%s" % uuid4().hex[:10])
37
45
  self.refs[name] = labeled_column
38
46
  self.refs[column.name] = labeled_column
39
47
 
@@ -41,70 +49,76 @@ class QueryTable(object):
41
49
  class SQLSource(Source):
42
50
  """Query mapper for loading data from a SQL query."""
43
51
 
44
- def __init__(self, query, data):
52
+ def __init__(self, query: "QueryMapping", data: Dict[str, Any]) -> None:
45
53
  super(SQLSource, self).__init__(query, data)
46
- self.database_uri = os.path.expandvars(data.get('database'))
47
- kwargs = {}
48
- if self.database_uri.lower().startswith('postgres'):
49
- kwargs['server_side_cursors'] = True
50
- self.engine = create_engine(self.database_uri,
51
- poolclass=NullPool,
52
- **kwargs)
54
+ database = data.get("database")
55
+ if database is None:
56
+ raise InvalidMapping("No database in SQL mapping!")
57
+ self.database_uri = cast(str, os.path.expandvars(database))
58
+ self.engine = create_engine(self.database_uri, poolclass=NullPool)
53
59
  self.meta = MetaData()
54
- self.meta.bind = self.engine
55
60
 
56
- tables = ensure_list(data.get('table'))
57
- tables.extend(ensure_list(data.get('tables')))
58
- self.tables = [QueryTable(self, f) for f in tables]
59
- self.joins = ensure_list(data.get('joins'))
61
+ tables = keys_values(data, "table", "tables")
62
+ self.tables = [QueryTable(self.meta, self.engine, f) for f in tables]
63
+ self.joins = cast(List[Dict[str, str]], ensure_list(data.get("joins")))
60
64
 
61
- def get_column(self, ref):
65
+ def get_column(self, ref: Optional[str]) -> Label[Any]:
62
66
  for table in self.tables:
63
67
  if ref in table.refs:
64
- return table.refs.get(ref)
68
+ return table.refs[ref]
65
69
  raise InvalidMapping("Missing reference: %s" % ref)
66
70
 
67
- def apply_filters(self, q):
71
+ def apply_filters(self, q: Select) -> Select:
68
72
  for col, val in self.filters:
69
- q = q.where(self.get_column(col) == val)
73
+ if is_listish(val):
74
+ q = q.where(self.get_column(col).in_(val))
75
+ else:
76
+ q = q.where(self.get_column(col) == val)
70
77
  for col, val in self.filters_not:
71
- q = q.where(self.get_column(col) != val)
78
+ if is_listish(val):
79
+ q = q.where(self.get_column(col).notin_(val))
80
+ else:
81
+ q = q.where(self.get_column(col) != val)
72
82
  # not sure this is a great idea:
73
83
  # if self.data.get('where'):
74
84
  # q = q.where(sql_text(self.data.get('where')))
75
85
  for join in self.joins:
76
- left = self.get_column(join.get('left'))
77
- right = self.get_column(join.get('right'))
86
+ left = self.get_column(join.get("left"))
87
+ right = self.get_column(join.get("right"))
78
88
  q = q.where(left == right)
79
89
  return q
80
90
 
81
- def compose_query(self):
82
- from_clause = [t.alias for t in self.tables]
91
+ def compose_query(self) -> Select:
83
92
  columns = [self.get_column(r) for r in self.query.refs]
84
- q = select(columns=columns, from_obj=from_clause, use_labels=True)
93
+ q = select(*columns)
94
+ q = q.select_from(*[t.alias for t in self.tables])
85
95
  return self.apply_filters(q)
86
96
 
87
97
  @property
88
- def records(self):
98
+ def records(self) -> Generator[Record, None, None]:
89
99
  """Compose the actual query and return an iterator of ``Record``."""
90
100
  mapping = [(r, self.get_column(r).name) for r in self.query.refs]
91
101
  q = self.compose_query()
92
102
  log.info("Query: %s", q)
93
- rp = self.engine.execute(q)
94
- while True:
95
- rows = rp.fetchmany(size=DATA_PAGE)
96
- if not len(rows):
97
- break
98
- for row in rows:
99
- data = {}
100
- for ref, name in mapping:
101
- data[ref] = stringify(row[name])
102
- yield data
103
-
104
- def __len__(self):
105
- from_clause = [t.alias for t in self.tables]
106
- columns = [func.count('*')]
107
- q = select(columns=columns, from_obj=from_clause, use_labels=True)
103
+ with self.engine.connect() as conn:
104
+ rp = conn.execution_options(stream_results=True).execute(q)
105
+ while True:
106
+ rows = rp.fetchmany(size=DATA_PAGE)
107
+ if not len(rows):
108
+ break
109
+ for row in rows:
110
+ row_map = row._mapping
111
+ data: Record = {}
112
+ for ref, name in mapping:
113
+ value = sanitize_text(row_map[name])
114
+ if value is not None:
115
+ data[ref] = value
116
+ yield data
117
+
118
+ def __len__(self) -> int:
119
+ q = select(func.count("*"))
120
+ q = q.select_from(*[t.alias for t in self.tables])
108
121
  q = self.apply_filters(q)
109
- rp = self.engine.execute(q)
110
- return rp.scalar()
122
+ with self.engine.connect() as conn:
123
+ rp = conn.execute(q)
124
+ return int(rp.scalar() or 0)
@@ -1,19 +1,25 @@
1
- import six
2
1
  import yaml
2
+ from typing import Any, Dict, Generator, List, TextIO, Tuple
3
3
 
4
+ Message = Tuple[Any, Any, List[str], List[str]]
4
5
 
5
- def extract_object(data, path):
6
+
7
+ def extract_object(
8
+ data: Dict[str, Any], path: List[str]
9
+ ) -> Generator[Message, None, None]:
6
10
  for key, value in data.items():
7
11
  subpath = path + [key]
8
- if isinstance(value, six.string_types):
9
- if key in ['label', 'reverse', 'description', 'plural']:
10
- comment = '.'.join(subpath)
12
+ if isinstance(value, str):
13
+ if key in ["label", "reverse", "description", "plural"]:
14
+ comment = ".".join(subpath)
11
15
  yield (None, None, [value], [comment])
12
16
  if isinstance(value, dict):
13
17
  for res in extract_object(value, subpath):
14
18
  yield res
15
19
 
16
20
 
17
- def extract_yaml(fileobj, keywords, comment_tags, options):
18
- data = yaml.load(fileobj)
21
+ def extract_yaml(
22
+ fileobj: TextIO, keywords: Any, comment_tags: Any, options: Any
23
+ ) -> Generator[Message, None, None]:
24
+ data = yaml.safe_load(fileobj)
19
25
  return extract_object(data, [])
followthemoney/model.py CHANGED
@@ -1,66 +1,109 @@
1
1
  import os
2
2
  import yaml
3
+ from typing import Any, Dict, Generator, Iterator, Optional, Set, TypedDict, Union
3
4
 
4
- from followthemoney.schema import Schema
5
+ from followthemoney.types import registry
6
+ from followthemoney.types.common import PropertyType, PropertyTypeToDict
7
+ from followthemoney.schema import Schema, SchemaToDict
8
+ from followthemoney.property import Property
5
9
  from followthemoney.mapping import QueryMapping
6
10
  from followthemoney.proxy import EntityProxy
7
11
  from followthemoney.exc import InvalidModel, InvalidData
8
12
 
9
13
 
14
+ class ModelToDict(TypedDict):
15
+ schemata: Dict[str, SchemaToDict]
16
+ types: Dict[str, PropertyTypeToDict]
17
+
18
+
10
19
  class Model(object):
11
- """A collection of schemata."""
20
+ """A collection of all the schemata available in followthemoney. The model
21
+ provides some helper functions to find schemata, properties or to instantiate
22
+ entity proxies based on the schema metadata."""
23
+
24
+ __slots__ = ("path", "schemata", "properties", "qnames")
12
25
 
13
- def __init__(self, path):
26
+ def __init__(self, path: str) -> None:
14
27
  self.path = path
15
- self.schemata = {}
28
+
29
+ #: A mapping with all schemata, organised by their name.
30
+ self.schemata: Dict[str, Schema] = {}
31
+
32
+ #: All properties defined in the model.
33
+ self.properties: Set[Property] = set()
34
+ self.qnames: Dict[str, Property] = {}
16
35
  for (path, _, filenames) in os.walk(self.path):
17
36
  for filename in filenames:
18
37
  self._load(os.path.join(path, filename))
19
38
  self.generate()
20
39
 
21
- def generate(self):
22
- self.properties = set()
40
+ def generate(self) -> None:
41
+ """Loading the model is a weird process because the schemata reference
42
+ each other in complex ways, so the generation process cannot be fully
43
+ run as schemata are being instantiated. Hence this process needs to be
44
+ called once all schemata are loaded to finalise dereferencing the
45
+ schemata."""
23
46
  for schema in self:
24
- schema.generate()
25
- for prop in schema.properties.values():
26
- self.properties.add(prop)
27
-
28
- def _load(self, filepath):
29
- with open(filepath, 'r') as fh:
30
- data = yaml.load(fh)
47
+ schema.generate(self)
48
+ for prop in self.properties:
49
+ self.qnames[prop.qname] = prop
50
+ for schema in prop.schema.descendants:
51
+ if prop.name not in schema.properties:
52
+ schema.properties[prop.name] = prop
53
+
54
+ def _load(self, filepath: str) -> None:
55
+ with open(filepath, "r", encoding="utf-8") as fh:
56
+ data = yaml.safe_load(fh)
31
57
  if not isinstance(data, dict):
32
- raise InvalidModel('Model file is not a mapping.')
58
+ raise InvalidModel("Model file is not a mapping: %s" % filepath)
33
59
  for name, config in data.items():
34
60
  self.schemata[name] = Schema(self, name, config)
35
61
 
36
- def get(self, name):
37
- if isinstance(name, Schema):
38
- return name
39
- return self.schemata.get(name)
62
+ def get(self, name: Union[str, Schema]) -> Optional[Schema]:
63
+ """Get a schema object based on a schema name. If the input is already
64
+ a schema object, it will just be returned."""
65
+ if isinstance(name, str):
66
+ return self.schemata.get(name)
67
+ return name
40
68
 
41
- def get_qname(self, qname):
42
- if not hasattr(self, '_qnames'):
43
- self._qnames = {p.qname: p for p in self.properties}
44
- return self._qnames.get(qname)
69
+ def get_qname(self, qname: str) -> Optional[Property]:
70
+ """Get a property object based on a qualified name (i.e. schema:property)."""
71
+ return self.qnames.get(qname)
45
72
 
46
- def __getitem__(self, name):
73
+ def __getitem__(self, name: str) -> Schema:
74
+ """Same as get(), but throws an exception when the given name does not exist."""
47
75
  schema = self.get(name)
48
76
  if schema is None:
49
77
  raise KeyError("No such schema: %s" % name)
50
78
  return schema
51
79
 
52
- def make_mapping(self, mapping, key_prefix=None):
80
+ def get_type_schemata(self, type_: PropertyType) -> Set[Schema]:
81
+ """Return all the schemata which have a property of the given type."""
82
+ schemata = set()
83
+ for schema in self.schemata.values():
84
+ for prop in schema.properties.values():
85
+ if prop.type == type_:
86
+ schemata.add(schema)
87
+ return schemata
88
+
89
+ def make_mapping(
90
+ self, mapping: Dict[str, Any], key_prefix: Optional[str] = None
91
+ ) -> QueryMapping:
53
92
  """Parse a mapping that applies (tabular) source data to the model."""
54
93
  return QueryMapping(self, mapping, key_prefix=key_prefix)
55
94
 
56
- def map_entities(self, mapping, key_prefix=None):
95
+ def map_entities(
96
+ self, mapping: Dict[str, Any], key_prefix: Optional[str] = None
97
+ ) -> Generator[EntityProxy, None, None]:
57
98
  """Given a mapping, yield a series of entities from the data source."""
58
- mapping = self.make_mapping(mapping, key_prefix=key_prefix)
59
- for record in mapping.source.records:
60
- for entity in mapping.map(record).values():
99
+ gen = self.make_mapping(mapping, key_prefix=key_prefix)
100
+ for record in gen.source.records:
101
+ for entity in gen.map(record).values():
61
102
  yield entity
62
103
 
63
- def common_schema(self, left, right):
104
+ def common_schema(
105
+ self, left: Union[str, Schema], right: Union[str, Schema]
106
+ ) -> Schema:
64
107
  """Select the most narrow of two schemata.
65
108
 
66
109
  When indexing data from a dataset, an entity may be declared as a
@@ -68,33 +111,42 @@ class Model(object):
68
111
  will select the most specific of two schemata offered. In the example,
69
112
  that would be Person.
70
113
  """
71
- left = self.get(left) or self.get(right)
72
- right = self.get(right) or self.get(left)
73
- left_schemata = list(left.schemata)
74
- right_schemata = list(right.schemata)
75
- if right in left_schemata:
76
- return left
77
- if left in right_schemata:
78
- return right
79
-
80
- # Find a common ancestor:
81
- for left in left_schemata:
82
- for right in right_schemata:
83
- if left == right:
84
- return left
85
-
86
- msg = "No common ancestor: %s and %s"
114
+ left_schema = self.get(left) or self.get(right)
115
+ right_schema = self.get(right) or self.get(left)
116
+ if left_schema is None or right_schema is None:
117
+ raise InvalidData("Invalid schema")
118
+ if left_schema.is_a(right_schema):
119
+ return left_schema
120
+ if right_schema.is_a(left_schema):
121
+ return right_schema
122
+ # for schema in self.schemata.values():
123
+ # if schema.is_a(left) and schema.is_a(right):
124
+ # return schema
125
+ msg = "No common schema: %s and %s"
87
126
  raise InvalidData(msg % (left, right))
88
127
 
89
- def make_entity(self, schema, key_prefix=None):
90
- schema = self.get(schema)
91
- return EntityProxy(schema, None, None, key_prefix=key_prefix)
92
-
93
- def get_proxy(self, data):
94
- return EntityProxy.from_dict(self, data)
95
-
96
- def to_dict(self):
97
- return {n: s.to_dict() for (n, s) in self.schemata.items()}
98
-
99
- def __iter__(self):
128
+ def make_entity(
129
+ self, schema: Union[str, Schema], key_prefix: Optional[str] = None
130
+ ) -> EntityProxy:
131
+ """Instantiate an empty entity proxy of the given schema type."""
132
+ return EntityProxy(self, {"schema": schema}, key_prefix=key_prefix)
133
+
134
+ def get_proxy(self, data: Dict[str, Any], cleaned: bool = True) -> EntityProxy:
135
+ """Create an entity proxy to reflect the entity data in the given
136
+ dictionary. If ``cleaned`` is disabled, all property values are
137
+ fully re-validated and normalised. Use this if handling input data
138
+ from an untrusted source."""
139
+ if isinstance(data, EntityProxy):
140
+ return data
141
+ return EntityProxy.from_dict(self, data, cleaned=cleaned)
142
+
143
+ def to_dict(self) -> ModelToDict:
144
+ """Return metadata for all schemata and properties, in a serializable form."""
145
+ return {
146
+ "schemata": {s.name: s.to_dict() for s in self.schemata.values()},
147
+ "types": {t.name: t.to_dict() for t in registry.types},
148
+ }
149
+
150
+ def __iter__(self) -> Iterator[Schema]:
151
+ """Iterate across all schemata."""
100
152
  return iter(self.schemata.values())
@@ -0,0 +1,119 @@
1
+ """
2
+ *We like our abstractions like our offshore banks: leaky.*
3
+
4
+ Entity ID namespaces are a security mechanism related to the Aleph search index.
5
+
6
+ Aleph allows the user (via mappings or the API) to create arbitrary entity IDs.
7
+ Entity IDs that are controlled by the user and not the system are unusual.
8
+ However, this makes it possible to generate bulk data outside Aleph,
9
+ and then load entities into the system as a continuous :ref:`streams`.
10
+
11
+ The problem is that having user controlled entity IDs increases the chance
12
+ of conflict in the search index.
13
+
14
+ Namespacing works around this by making each entity ID consist of two parts:
15
+ one controlled by the client, the other controlled by the system. The second
16
+ part of the ID is called its `signature`::
17
+
18
+ entity_id.a40a29300ac6bb79dd2f911e77bbda7a3b502126
19
+
20
+ The signature is generated as ``hmac(entity_id, dataset_id)``. This guarantees
21
+ that the combined ID is specific to a dataset, without needing an (expensive)
22
+ index look up of each ID first. It can also be generated on the client or
23
+ the server without compromising isolation.
24
+ """
25
+ import hmac
26
+ from typing import Any, Optional, Tuple, Union
27
+
28
+ from followthemoney.types import registry
29
+ from followthemoney.proxy import E
30
+ from followthemoney.util import key_bytes, get_entity_id
31
+
32
+
33
class Namespace(object):
    """Namespaces are used to partition entity IDs into different units,
    which traditionally represent a dataset, collection or source.

    A namespaced ID has the form ``<plain_id><SEP><signature>``, where the
    signature is the hex digest of an HMAC-SHA1 keyed with the namespace name.

    See module docstring for details."""

    #: Separator between the plain entity ID and its signature.
    SEP = "."

    def __init__(self, name: Optional[str] = None) -> None:
        """Create a namespace; a missing or empty ``name`` gives an empty key,
        which makes ``signature()`` return None and ``sign()`` only strip."""
        # key_bytes is a project helper; presumably it encodes the name to bytes.
        self.bname = key_bytes(name) if name else b""
        # Keyed HMAC template; signature() copies it for every entity ID.
        self.hmac = hmac.new(self.bname, digestmod="sha1")

    @classmethod
    def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
        """Split up an entity ID into the plain ID and the namespace
        signature. If either part is missing, return None instead."""
        clean_id = registry.entity.clean(entity_id)
        if clean_id is None:
            return (None, None)
        try:
            # Split on the last separator only; unpacking raises ValueError
            # when the ID contains no separator at all.
            plain_id, checksum = clean_id.rsplit(cls.SEP, 1)
            return (plain_id, checksum)
        except ValueError:
            return (clean_id, None)

    @classmethod
    def strip(cls, entity_id: str) -> Optional[str]:
        """Return the plain ID part of ``entity_id``, dropping any signature."""
        plain_id, _ = cls.parse(entity_id)
        return plain_id

    def signature(self, entity_id: str) -> Optional[str]:
        """Generate a namespace-specific signature.

        Returns None if the namespace has an empty name or ``entity_id`` is
        None; otherwise the hex digest of the keyed HMAC over the ID."""
        if not self.bname or entity_id is None:
            return None
        # Copy so the shared, keyed HMAC object is never mutated.
        digest = self.hmac.copy()
        digest.update(key_bytes(entity_id))
        return digest.hexdigest()

    def sign(self, entity_id: str) -> Optional[str]:
        """Apply a namespace signature to an entity ID, removing any
        previous namespace marker."""
        parsed_id, _ = self.parse(entity_id)
        if not self.bname:
            # A namespace without a name only strips an existing signature.
            return parsed_id
        if parsed_id is None:
            return None
        digest = self.signature(parsed_id)
        if digest is None:
            return None
        return self.SEP.join((parsed_id, digest))

    def verify(self, entity_id: str) -> bool:
        """Check if the signature matches the current namespace."""
        parsed_id, digest = self.parse(entity_id)
        if digest is None or parsed_id is None:
            return False
        signature = self.signature(parsed_id)
        if signature is None:
            return False
        # Constant-time comparison of the supplied and the expected digest.
        return hmac.compare_digest(digest, signature)

    def apply(self, proxy: E, shallow: bool = False) -> E:
        """Rewrite an entity proxy so all IDs mentioned are limited to
        the namespace.

        Works on a clone of ``proxy``. Unless ``shallow`` is set, the values
        of every entity-typed property are re-signed as well."""
        signed = proxy.clone()
        signed.id = self.sign(proxy.id)
        if not shallow:
            for prop in proxy.iterprops():
                if prop.type != registry.entity:
                    continue
                # Pop the existing references and add them back signed.
                for value in signed.pop(prop):
                    entity_id = get_entity_id(value)
                    if entity_id is not None:
                        signed.add(prop, self.sign(entity_id))
        return signed

    @classmethod
    def make(cls, name: Union[str, "Namespace"]) -> "Namespace":
        """Build a namespace from a name; any non-string argument (expected
        to be a Namespace already) is returned unchanged."""
        if isinstance(name, str):
            return cls(name)
        return name

    def __eq__(self, other: Any) -> bool:
        """Two namespaces are equal when their name bytes are equal."""
        if not isinstance(other, Namespace):
            # Let Python fall back to its default handling instead of
            # raising AttributeError on objects without ``bname``.
            return NotImplemented
        return bool(self.bname == other.bname)

    def __hash__(self) -> int:
        # Defining __eq__ alone would make instances unhashable; hash on the
        # same field that equality compares.
        return hash(self.bname)

    def __repr__(self) -> str:
        return "<Namespace(%r)>" % self.bname
@@ -0,0 +1,48 @@
1
+ from followthemoney.proxy import E
2
+
3
+ # Derived from: https://fsi.taxjustice.net/en/introduction/fsi-results
4
# Lower-case two-letter codes (presumably ISO 3166-1 alpha-2 country codes)
# that offshore_from_jurisdiction treats as offshore jurisdictions.
OFFSHORE_COUNTRIES = {
    "ky",
    "ch",
    "sg",
    "lu",
    "vg",
    "gg",
    "pa",
    "je",
    "mt",
    "bs",
    "cy",
    "gi",
    "mo",
    "bm",
    "im",
    "mh",
    "mu",
    "li",
    "ai",
    "kn",
    "tc",
    "vu",
    "mc",
    "sc",
    "ag",
    "dm",
    "ms",
    "lc",
    "ck",
}
37
+
38
+
39
def offshore_from_jurisdiction(proxy: E) -> E:
    """Tag organizations linked to a well-known offshore jurisdiction as
    offshores automatically. Complete generalization, use only in experiments.

    Proxies whose schema is not an ``Organization`` are returned untouched.
    Otherwise the ``country`` and ``jurisdiction`` values are collected and,
    if any of them is listed in ``OFFSHORE_COUNTRIES``, the topic
    ``corp.offshore`` is added. The proxy is modified in place and returned.
    """
    if not proxy.schema.is_a("Organization"):
        return proxy
    countries = set(proxy.get("country", quiet=True))
    countries.update(proxy.get("jurisdiction", quiet=True))
    # isdisjoint answers "any overlap?" without building an intersection set.
    if not countries.isdisjoint(OFFSHORE_COUNTRIES):
        proxy.add("topics", "corp.offshore")
    return proxy
+ return proxy