followthemoney 1.3.7__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. followthemoney/__init__.py +5 -3
  2. followthemoney/cli/__init__.py +17 -0
  3. followthemoney/cli/aggregate.py +56 -0
  4. followthemoney/cli/cli.py +88 -0
  5. followthemoney/cli/exports.py +121 -0
  6. followthemoney/cli/mapping.py +85 -0
  7. followthemoney/cli/sieve.py +67 -0
  8. followthemoney/cli/util.py +142 -0
  9. followthemoney/compare.py +130 -60
  10. followthemoney/exc.py +19 -6
  11. followthemoney/export/common.py +29 -0
  12. followthemoney/export/csv.py +82 -0
  13. followthemoney/export/excel.py +75 -0
  14. followthemoney/export/graph.py +79 -0
  15. followthemoney/export/neo4j.py +182 -0
  16. followthemoney/export/rdf.py +26 -0
  17. followthemoney/graph.py +308 -0
  18. followthemoney/helpers.py +212 -0
  19. followthemoney/mapping/__init__.py +1 -1
  20. followthemoney/mapping/csv.py +67 -35
  21. followthemoney/mapping/entity.py +116 -44
  22. followthemoney/mapping/property.py +90 -44
  23. followthemoney/mapping/query.py +27 -19
  24. followthemoney/mapping/source.py +15 -5
  25. followthemoney/mapping/sql.py +75 -61
  26. followthemoney/messages.py +13 -7
  27. followthemoney/model.py +108 -56
  28. followthemoney/namespace.py +119 -0
  29. followthemoney/offshore.py +48 -0
  30. followthemoney/ontology.py +77 -0
  31. followthemoney/property.py +204 -71
  32. followthemoney/proxy.py +455 -118
  33. followthemoney/rdf.py +9 -0
  34. followthemoney/schema/Address.yaml +78 -0
  35. followthemoney/schema/Airplane.yaml +17 -10
  36. followthemoney/schema/Analyzable.yaml +54 -0
  37. followthemoney/schema/Article.yaml +16 -0
  38. followthemoney/schema/Assessment.yaml +32 -0
  39. followthemoney/schema/Asset.yaml +10 -4
  40. followthemoney/schema/Associate.yaml +41 -0
  41. followthemoney/schema/Audio.yaml +24 -0
  42. followthemoney/schema/BankAccount.yaml +53 -9
  43. followthemoney/schema/Call.yaml +48 -0
  44. followthemoney/schema/CallForTenders.yaml +117 -0
  45. followthemoney/schema/Company.yaml +37 -12
  46. followthemoney/schema/Contract.yaml +41 -7
  47. followthemoney/schema/ContractAward.yaml +30 -11
  48. followthemoney/schema/CourtCase.yaml +16 -10
  49. followthemoney/schema/CourtCaseParty.yaml +17 -6
  50. followthemoney/schema/CryptoWallet.yaml +48 -0
  51. followthemoney/schema/Debt.yaml +37 -0
  52. followthemoney/schema/Directorship.yaml +17 -4
  53. followthemoney/schema/Document.yaml +72 -139
  54. followthemoney/schema/Documentation.yml +38 -0
  55. followthemoney/schema/EconomicActivity.yaml +32 -17
  56. followthemoney/schema/Email.yaml +76 -0
  57. followthemoney/schema/Employment.yaml +39 -0
  58. followthemoney/schema/Event.yaml +35 -3
  59. followthemoney/schema/Family.yaml +41 -0
  60. followthemoney/schema/Folder.yaml +13 -0
  61. followthemoney/schema/HyperText.yaml +21 -0
  62. followthemoney/schema/Identification.yaml +40 -0
  63. followthemoney/schema/Image.yaml +25 -0
  64. followthemoney/schema/Interest.yaml +3 -6
  65. followthemoney/schema/Interval.yaml +56 -5
  66. followthemoney/schema/LegalEntity.yaml +81 -20
  67. followthemoney/schema/License.yaml +7 -3
  68. followthemoney/schema/Membership.yaml +19 -4
  69. followthemoney/schema/Mention.yaml +54 -0
  70. followthemoney/schema/Message.yaml +78 -0
  71. followthemoney/schema/Note.yaml +23 -0
  72. followthemoney/schema/Occupancy.yaml +44 -0
  73. followthemoney/schema/Organization.yaml +38 -3
  74. followthemoney/schema/Ownership.yaml +16 -4
  75. followthemoney/schema/Package.yaml +17 -0
  76. followthemoney/schema/Page.yaml +43 -0
  77. followthemoney/schema/Pages.yaml +23 -0
  78. followthemoney/schema/Passport.yaml +16 -17
  79. followthemoney/schema/Payment.yaml +38 -7
  80. followthemoney/schema/Person.yaml +61 -5
  81. followthemoney/schema/PlainText.yaml +17 -0
  82. followthemoney/schema/Position.yaml +50 -0
  83. followthemoney/schema/Post.yaml +42 -0
  84. followthemoney/schema/Project.yaml +27 -0
  85. followthemoney/schema/ProjectParticipant.yaml +36 -0
  86. followthemoney/schema/PublicBody.yaml +14 -3
  87. followthemoney/schema/RealEstate.yaml +19 -3
  88. followthemoney/schema/Representation.yaml +17 -6
  89. followthemoney/schema/Sanction.yaml +45 -21
  90. followthemoney/schema/Security.yaml +59 -0
  91. followthemoney/schema/Similar.yaml +37 -0
  92. followthemoney/schema/Succession.yaml +36 -0
  93. followthemoney/schema/Table.yaml +32 -0
  94. followthemoney/schema/TaxRoll.yaml +27 -9
  95. followthemoney/schema/Thing.yaml +69 -13
  96. followthemoney/schema/Trip.yaml +42 -0
  97. followthemoney/schema/UnknownLink.yaml +17 -6
  98. followthemoney/schema/UserAccount.yaml +44 -0
  99. followthemoney/schema/Value.yaml +5 -1
  100. followthemoney/schema/Vehicle.yaml +25 -8
  101. followthemoney/schema/Vessel.yaml +18 -10
  102. followthemoney/schema/Video.yaml +20 -0
  103. followthemoney/schema/Workbook.yaml +18 -0
  104. followthemoney/schema.py +436 -135
  105. followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
  106. followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
  107. followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
  108. followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
  109. followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
  110. followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
  111. followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
  112. followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
  113. followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
  114. followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
  115. followthemoney/translations/fr/followthemoney.po +3861 -0
  116. followthemoney/translations/messages.pot +3021 -725
  117. followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
  118. followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
  119. followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
  120. followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
  121. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
  122. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
  123. followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
  124. followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
  125. followthemoney/translations/ru/followthemoney.po +4221 -0
  126. followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
  127. followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
  128. followthemoney/types/__init__.py +35 -17
  129. followthemoney/types/address.py +50 -21
  130. followthemoney/types/checksum.py +25 -0
  131. followthemoney/types/common.py +233 -88
  132. followthemoney/types/country.py +50 -56
  133. followthemoney/types/date.py +59 -76
  134. followthemoney/types/email.py +66 -35
  135. followthemoney/types/entity.py +66 -13
  136. followthemoney/types/gender.py +66 -0
  137. followthemoney/types/iban.py +47 -28
  138. followthemoney/types/identifier.py +49 -22
  139. followthemoney/types/ip.py +35 -21
  140. followthemoney/types/json.py +58 -0
  141. followthemoney/types/language.py +124 -37
  142. followthemoney/types/mimetype.py +44 -0
  143. followthemoney/types/name.py +56 -12
  144. followthemoney/types/number.py +30 -0
  145. followthemoney/types/phone.py +92 -34
  146. followthemoney/types/registry.py +52 -0
  147. followthemoney/types/string.py +43 -0
  148. followthemoney/types/topic.py +94 -0
  149. followthemoney/types/url.py +39 -17
  150. followthemoney/util.py +139 -45
  151. followthemoney-3.8.1.dist-info/METADATA +153 -0
  152. followthemoney-3.8.1.dist-info/RECORD +157 -0
  153. {followthemoney-1.3.7.dist-info → followthemoney-3.8.1.dist-info}/WHEEL +1 -2
  154. followthemoney-3.8.1.dist-info/entry_points.txt +17 -0
  155. followthemoney-1.3.7.dist-info/LICENSE.txt → followthemoney-3.8.1.dist-info/licenses/LICENSE +1 -1
  156. followthemoney/link.py +0 -75
  157. followthemoney/schema/Associate.yml +0 -19
  158. followthemoney/schema/Family.yml +0 -19
  159. followthemoney/schema/Land.yml +0 -9
  160. followthemoney/schema/Relationship.yaml +0 -26
  161. followthemoney/types/domain.py +0 -50
  162. followthemoney-1.3.7.dist-info/DESCRIPTION.rst +0 -3
  163. followthemoney-1.3.7.dist-info/METADATA +0 -39
  164. followthemoney-1.3.7.dist-info/RECORD +0 -108
  165. followthemoney-1.3.7.dist-info/entry_points.txt +0 -3
  166. followthemoney-1.3.7.dist-info/metadata.json +0 -1
  167. followthemoney-1.3.7.dist-info/namespace_packages.txt +0 -1
  168. followthemoney-1.3.7.dist-info/top_level.txt +0 -3
  169. ns/ontology.py +0 -128
  170. tests/types/test_addresses.py +0 -24
  171. tests/types/test_common.py +0 -32
  172. tests/types/test_countries.py +0 -27
  173. tests/types/test_dates.py +0 -73
  174. tests/types/test_domains.py +0 -23
  175. tests/types/test_emails.py +0 -32
  176. tests/types/test_entity.py +0 -19
  177. tests/types/test_iban.py +0 -109
  178. tests/types/test_identifiers.py +0 -27
  179. tests/types/test_ip.py +0 -29
  180. tests/types/test_languages.py +0 -23
  181. tests/types/test_names.py +0 -33
  182. tests/types/test_phones.py +0 -24
  183. tests/types/test_registry.py +0 -14
  184. tests/types/test_urls.py +0 -23
  185. {ns → followthemoney/export}/__init__.py +0 -0
  186. /tests/types/__init__.py → /followthemoney/py.typed +0 -0
@@ -1,95 +1,78 @@
1
- import re
2
1
  import os
3
- import pytz
4
- from rdflib import Literal
5
- from rdflib.namespace import XSD
6
- from normality import stringify
7
- from datetime import datetime, date
2
+ from datetime import datetime, timezone
3
+ from typing import Optional, TYPE_CHECKING
4
+ from prefixdate import parse, parse_format, Precision
8
5
 
9
6
  from followthemoney.types.common import PropertyType
7
+ from followthemoney.rdf import XSD, Literal, Identifier
8
+ from followthemoney.util import defer as _
10
9
  from followthemoney.util import dampen
11
10
 
11
+ if TYPE_CHECKING:
12
+ from followthemoney.proxy import EntityProxy
12
13
 
13
- class DateType(PropertyType):
14
- # JS: '^([12]\\d{3}(-[01]?[1-9](-[0123]?[1-9])?)?)?$'
15
- DATE_RE = re.compile('^([12]\d{3}(-[01]?[0-9](-[0123]?[0-9]([T ]([012]?\d(:\d{1,2}(:\d{1,2}(\.\d{6})?(Z|[-+]\d{2}(:?\d{2})?)?)?)?)?)?)?)?)?$') # noqa
16
- DATE_FULL = re.compile('\d{4}-\d{2}-\d{2}.*')
17
- CUT_ZEROES = re.compile(r'((\-00.*)|(.00:00:00))$')
18
- MONTH_FORMATS = re.compile(r'(%b|%B|%m|%c|%x)')
19
- DAY_FORMATS = re.compile(r'(%d|%w|%c|%x)')
20
- MAX_LENGTH = 19
21
14
 
22
- name = 'date'
23
- group = 'dates'
24
- prefix = 'date'
15
+ class DateType(PropertyType):
16
+ """A date or time stamp. This is based on ISO 8601, but meant to allow for different
17
+ degrees of precision by specifying a prefix. This means that `2021`, `2021-02`,
18
+ `2021-02-16`, `2021-02-16T21`, `2021-02-16T21:48` and `2021-02-16T21:48:52`
19
+ are all valid values, with an implied precision.
25
20
 
26
- def validate(self, obj, **kwargs):
27
- """Check if a thing is a valid date."""
28
- obj = stringify(obj)
29
- if obj is None:
30
- return False
31
- return self.DATE_RE.match(obj) is not None
21
+ The timezone is always expected to be UTC and cannot be specified otherwise. There is
22
+ no support for calendar weeks (`2021-W7`) and date ranges (`2021-2024`)."""
32
23
 
33
- def _clean_datetime(self, obj):
34
- """Python objects want to be text."""
35
- if isinstance(obj, datetime):
36
- # if it's not naive, put it on zulu time first:
37
- if obj.tzinfo is not None:
38
- obj = obj.astimezone(pytz.utc)
39
- return obj.isoformat()[:self.MAX_LENGTH]
40
- if isinstance(obj, date):
41
- return obj.isoformat()
24
+ name = "date"
25
+ group = "dates"
26
+ label = _("Date")
27
+ plural = _("Dates")
28
+ matchable = True
29
+ max_length = 32
42
30
 
43
- def _clean_text(self, text):
44
- # limit to the date part of a presumed date string
45
- # FIXME: this may get us rid of TZ info?
46
- text = text[:self.MAX_LENGTH]
47
- if not self.validate(text):
48
- return None
49
- text = text.replace(' ', 'T')
50
- # fix up dates like 2017-1-5 into 2017-01-05
51
- if not self.DATE_FULL.match(text):
52
- parts = text.split('T', 1)
53
- date = [p.zfill(2) for p in parts[0].split('-')]
54
- parts[0] = '-'.join(date)
55
- text = 'T'.join(parts)
56
- text = text[:self.MAX_LENGTH]
57
- # strip -00-00 from dates because it makes ES barf.
58
- text = self.CUT_ZEROES.sub('', text)
59
- return text
31
+ def validate(
32
+ self, value: str, fuzzy: bool = False, format: Optional[str] = None
33
+ ) -> bool:
34
+ """Check if a thing is a valid date."""
35
+ if format is not None:
36
+ prefix = parse_format(value, format)
37
+ else:
38
+ prefix = parse(value)
39
+ return prefix.precision != Precision.EMPTY
60
40
 
61
- def clean(self, text, format=None, **kwargs):
41
+ def clean_text(
42
+ self,
43
+ text: str,
44
+ fuzzy: bool = False,
45
+ format: Optional[str] = None,
46
+ proxy: Optional["EntityProxy"] = None,
47
+ ) -> Optional[str]:
62
48
  """The classic: date parsing, every which way."""
63
- # handle date/datetime before converting to text.
64
- date = self._clean_datetime(text)
65
- if date is not None:
66
- return date
67
-
68
- text = stringify(text)
69
- if text is None:
70
- return
71
-
72
49
  if format is not None:
73
- # parse with a specified format
74
- try:
75
- obj = datetime.strptime(text, format)
76
- text = obj.date().isoformat()
77
- if self.MONTH_FORMATS.search(format) is None:
78
- text = text[:4]
79
- elif self.DAY_FORMATS.search(format) is None:
80
- text = text[:7]
81
- return text
82
- except Exception:
83
- return None
84
-
85
- return self._clean_text(text)
50
+ return parse_format(text, format).text
51
+ return parse(text).text
86
52
 
87
- def specificity(self, value):
88
- return dampen(5, self.MAX_LENGTH, value) * .5
53
+ def _specificity(self, value: str) -> float:
54
+ return dampen(5, 13, value)
89
55
 
90
- def compare(self, left, right):
56
+ def compare(self, left: str, right: str) -> float:
91
57
  prefix = os.path.commonprefix([left, right])
92
58
  return dampen(4, 10, prefix)
93
59
 
94
- def rdf(self, value):
60
+ def rdf(self, value: str) -> Identifier:
61
+ if len(value) < Precision.HOUR.value:
62
+ return Literal(value, datatype=XSD.date)
95
63
  return Literal(value, datatype=XSD.dateTime)
64
+
65
+ def node_id(self, value: str) -> str:
66
+ return f"date:{value}"
67
+
68
+ def to_datetime(self, value: str) -> Optional[datetime]:
69
+ return parse(value).dt
70
+
71
+ def to_number(self, value: str) -> Optional[float]:
72
+ date = self.to_datetime(value)
73
+ if date is None:
74
+ return None
75
+ # We make a best effort all over the app to ensure all times are in UTC.
76
+ if date.tzinfo is None:
77
+ date = date.replace(tzinfo=timezone.utc)
78
+ return date.timestamp()
@@ -1,54 +1,85 @@
1
1
  import re
2
- from rdflib import URIRef
3
- from normality import stringify
2
+ import logging
3
+ from typing import Optional, TYPE_CHECKING
4
+ from urllib.parse import urlparse
4
5
  from normality.cleaning import strip_quotes
5
6
 
7
+ from followthemoney.rdf import URIRef, Identifier
6
8
  from followthemoney.types.common import PropertyType
7
- from followthemoney.types.domain import DomainType
9
+ from followthemoney.util import sanitize_text, defer as _
10
+
11
+ log = logging.getLogger(__name__)
12
+
13
+ if TYPE_CHECKING:
14
+ from followthemoney.proxy import EntityProxy
8
15
 
9
16
 
10
17
  class EmailType(PropertyType):
11
- EMAIL_REGEX = re.compile(r"[^@]+@[^@]+\.[^@]+")
12
- domains = DomainType()
13
- name = 'email'
14
- group = 'emails'
15
- prefix = 'mail'
16
- strong = True
17
-
18
- def validate(self, email, **kwargs):
18
+ """Internet mail address (e.g. user@example.com). These are notoriously hard
19
+ to validate, but we use an irresponsibly simple rule and hope for the best."""
20
+
21
+ REGEX_RAW = r"^[^@\s]+@[^@\s]+\.\w+$"
22
+ REGEX = re.compile(REGEX_RAW)
23
+ name = "email"
24
+ group = "emails"
25
+ label = _("E-Mail Address")
26
+ plural = _("E-Mail Addresses")
27
+ matchable = True
28
+ pivot = True
29
+
30
+ # def _check_exists(self, domain):
31
+ # """Actually try to resolve a domain name."""
32
+ # try:
33
+ # domain = domain.encode('idna').lower()
34
+ # socket.getaddrinfo(domain, None)
35
+ # return True
36
+ # except:
37
+ # return False
38
+
39
+ def validate(
40
+ self, value: str, fuzzy: bool = False, format: Optional[str] = None
41
+ ) -> bool:
19
42
  """Check to see if this is a valid email address."""
20
- email = stringify(email)
21
- if email is None:
22
- return
23
- if not self.EMAIL_REGEX.match(email):
43
+ # TODO: adopt email.utils.parseaddr
44
+ email = sanitize_text(value)
45
+ if email is None or not self.REGEX.match(email):
24
46
  return False
25
- mailbox, domain = email.rsplit('@', 1)
26
- return self.domains.validate(domain, **kwargs)
47
+ _, domain = email.rsplit("@", 1)
48
+ if len(domain) < 4 or "." not in domain:
49
+ return False
50
+ return True
27
51
 
28
- def clean_text(self, email, **kwargs):
52
+ def clean_text(
53
+ self,
54
+ text: str,
55
+ fuzzy: bool = False,
56
+ format: Optional[str] = None,
57
+ proxy: Optional["EntityProxy"] = None,
58
+ ) -> Optional[str]:
29
59
  """Parse and normalize an email address.
30
60
 
31
61
  Returns None if this is not an email address.
32
62
  """
33
- if not self.EMAIL_REGEX.match(email):
63
+ email = strip_quotes(text)
64
+ if email is None or not self.REGEX.match(email):
34
65
  return None
35
- email = strip_quotes(email)
36
- mailbox, domain = email.rsplit('@', 1)
37
- domain = self.domains.clean(domain, **kwargs)
38
- if domain is None or mailbox is None:
39
- return
40
- return '@'.join((mailbox, domain))
41
-
42
- def normalize(self, email, **kwargs):
43
- """Normalize for comparison."""
44
- emails = super(EmailType, self).normalize(email, **kwargs)
45
- return [e.lower() for e in emails]
46
-
47
- def specificity(self, value):
48
- return 0 if value is None else 1
66
+ mailbox, domain = email.rsplit("@", 1)
67
+ # TODO: https://pypi.python.org/pypi/publicsuffix/
68
+ # handle URLs by extracting the domain name
69
+ domain = urlparse(domain).hostname or domain
70
+ domain = domain.lower()
71
+ domain = domain.rstrip(".")
72
+ # handle unicode
73
+ try:
74
+ domain = domain.encode("idna").decode("ascii")
75
+ except UnicodeError:
76
+ return None
77
+ if domain is not None and mailbox is not None:
78
+ return "@".join((mailbox, domain))
79
+ return None
49
80
 
50
81
  # def country_hint(self, value)
51
82
  # TODO: do we want to use TLDs as country evidence?
52
83
 
53
- def rdf(self, value):
54
- return URIRef('mailto:%s' % value)
84
+ def rdf(self, value: str) -> Identifier:
85
+ return URIRef("mailto:%s" % value.lower())
@@ -1,21 +1,74 @@
1
- from rdflib import URIRef
2
- from banal import is_mapping
1
+ import re
2
+ from typing import Any, Optional, TYPE_CHECKING
3
3
 
4
4
  from followthemoney.types.common import PropertyType
5
+ from followthemoney.rdf import URIRef, Identifier
6
+ from followthemoney.util import ENTITY_ID_LEN, get_entity_id, sanitize_text
7
+ from followthemoney.util import gettext, defer as _
8
+ from followthemoney.exc import InvalidData
9
+
10
+ if TYPE_CHECKING:
11
+ from followthemoney.proxy import EntityProxy
5
12
 
6
13
 
7
14
  class EntityType(PropertyType):
8
- name = 'entity'
9
- group = 'entities'
10
- prefix = 'e'
15
+ """A reference to another entity via its ID. This is how entities in FtM
16
+ become a graph: by pointing at each other using references.
17
+
18
+ Entity IDs can either be `namespaced` or `plain`, depending on the context.
19
+ When setting properties of this type, you can pass in an entity proxy or
20
+ dict of the entity, the ID will then be extracted and stored.
21
+ """
22
+
23
+ REGEX_RAW = r"^[0-9a-zA-Z]([0-9a-zA-Z\.\-]*[0-9a-zA-Z])?$"
24
+ REGEX = re.compile(REGEX_RAW)
25
+ name = "entity"
26
+ group = "entities"
27
+ label = _("Entity")
28
+ plural = _("Entities")
29
+ matchable = True
30
+ pivot = True
31
+ max_length = ENTITY_ID_LEN
32
+
33
+ def validate(
34
+ self, value: str, fuzzy: bool = False, format: Optional[str] = None
35
+ ) -> bool:
36
+ text = sanitize_text(value)
37
+ if text is None:
38
+ return False
39
+ return self.REGEX.match(text) is not None
40
+
41
+ def clean(
42
+ self,
43
+ raw: Any,
44
+ fuzzy: bool = False,
45
+ format: Optional[str] = None,
46
+ proxy: Optional["EntityProxy"] = None,
47
+ ) -> Optional[str]:
48
+ entity_id = get_entity_id(raw)
49
+ if entity_id is None:
50
+ return None
51
+ return self.clean_text(entity_id, fuzzy=fuzzy, format=format, proxy=proxy)
11
52
 
12
- def clean(self, text, **kwargs):
13
- if is_mapping(text):
14
- text = text.get('id')
15
- return super(EntityType, self).clean(text, **kwargs)
53
+ def clean_text(
54
+ self,
55
+ text: str,
56
+ fuzzy: bool = False,
57
+ format: Optional[str] = None,
58
+ proxy: Optional["EntityProxy"] = None,
59
+ ) -> Optional[str]:
60
+ """Specific types can apply their own cleaning routines here (this is called
61
+ by ``clean`` after the value has been converted to a string and null values
62
+ have been filtered)."""
63
+ if proxy is not None and text == proxy.id:
64
+ msg = gettext("Self-relationship (%s): %s")
65
+ raise InvalidData(msg % (proxy.schema, text))
66
+ if self.REGEX.match(text) is not None:
67
+ return text
68
+ return None
16
69
 
17
- def specificity(self, value):
18
- return 1
70
+ def rdf(self, value: str) -> Identifier:
71
+ return URIRef(f"entity:{value}")
19
72
 
20
- def rdf(self, value):
21
- return URIRef('urn:entity:%s' % value)
73
+ def caption(self, value: str) -> None:
74
+ return None
@@ -0,0 +1,66 @@
1
+ from typing import Optional, TYPE_CHECKING
2
+ from babel.core import Locale
3
+
4
+ from followthemoney.types.common import EnumType, EnumValues
5
+ from followthemoney.rdf import URIRef, Identifier
6
+ from followthemoney.util import gettext, defer as _
7
+
8
+ if TYPE_CHECKING:
9
+ from followthemoney.proxy import EntityProxy
10
+
11
+
12
+ class GenderType(EnumType):
13
+ """A human gender. This is not meant to be a comprehensive model of
14
+ the social realities of gender but a way to capture data from (mostly)
15
+ government databases and represent it in a way that can be used by
16
+ structured tools. I'm not sure this justifies the simplification."""
17
+
18
+ MALE = "male"
19
+ FEMALE = "female"
20
+ OTHER = "other"
21
+
22
+ LOOKUP = {
23
+ "m": MALE,
24
+ "man": MALE,
25
+ "masculin": MALE,
26
+ "männlich": MALE,
27
+ "мужской": MALE,
28
+ "f": FEMALE,
29
+ "woman": FEMALE,
30
+ "féminin": FEMALE,
31
+ "weiblich": FEMALE,
32
+ "женский": FEMALE,
33
+ "o": OTHER,
34
+ "d": OTHER,
35
+ "divers": OTHER,
36
+ }
37
+
38
+ name = "gender"
39
+ group = "genders"
40
+ label = _("Gender")
41
+ plural = _("Genders")
42
+ matchable = False
43
+ max_length = 16
44
+
45
+ def _locale_names(self, locale: Locale) -> EnumValues:
46
+ return {
47
+ self.MALE: gettext("male"),
48
+ self.FEMALE: gettext("female"),
49
+ self.OTHER: gettext("other"),
50
+ }
51
+
52
+ def clean_text(
53
+ self,
54
+ text: str,
55
+ fuzzy: bool = False,
56
+ format: Optional[str] = None,
57
+ proxy: Optional["EntityProxy"] = None,
58
+ ) -> Optional[str]:
59
+ code = text.lower().strip()
60
+ code = self.LOOKUP.get(code, code)
61
+ if code not in self.codes:
62
+ return None
63
+ return code
64
+
65
+ def rdf(self, value: str) -> Identifier:
66
+ return URIRef(f"gender:{value}")
@@ -1,39 +1,58 @@
1
- from rdflib import URIRef
2
- from normality import stringify
3
- from schwifty import IBAN
1
+ from typing import Optional, TYPE_CHECKING
2
+ from rigour.ids import IBAN
4
3
 
5
4
  from followthemoney.types.common import PropertyType
5
+ from followthemoney.rdf import URIRef, Identifier
6
+ from followthemoney.util import sanitize_text, defer as _
7
+
8
+ if TYPE_CHECKING:
9
+ from followthemoney.proxy import EntityProxy
6
10
 
7
11
 
8
12
  class IbanType(PropertyType):
9
- name = 'iban'
10
- group = 'ibans'
11
- prefix = 'iban'
12
- strong = False
13
-
14
- def validate(self, iban, **kwargs):
15
- iban = stringify(iban)
16
- if iban is None:
17
- return False
18
- try:
19
- IBAN(iban)
20
- return True
21
- except ValueError as ex:
22
- print(ex)
13
+ """An international bank account number, as defined in ISO 13616. IBANs are
14
+ managed by SWIFT used in the European SEPA payment system.
15
+
16
+ A notable aspect of IBANs is that they share a country prefix and validation
17
+ mechanism, but the specific length of an IBAN is dependent on the country
18
+ code defined in the first two characters: `NO8330001234567` and
19
+ `CY21002001950000357001234567` are both valid values."""
20
+
21
+ name = "iban"
22
+ group = "ibans"
23
+ label = _("IBAN")
24
+ plural = _("IBANs")
25
+ matchable = True
26
+ pivot = True
27
+ max_length = 64
28
+
29
+ def validate(
30
+ self, value: str, fuzzy: bool = False, format: Optional[str] = None
31
+ ) -> bool:
32
+ text = sanitize_text(value)
33
+ if text is None:
23
34
  return False
24
-
25
- def clean_text(self, text, **kwargs):
35
+ return IBAN.is_valid(text)
36
+
37
+ def clean_text(
38
+ self,
39
+ text: str,
40
+ fuzzy: bool = False,
41
+ format: Optional[str] = None,
42
+ proxy: Optional["EntityProxy"] = None,
43
+ ) -> Optional[str]:
26
44
  """Create a more clean, but still user-facing version of an
27
45
  instance of the type."""
28
- return text.replace(" ", "").upper()
46
+ return IBAN.normalize(text)
47
+
48
+ def country_hint(self, value: str) -> str:
49
+ return value[:2].lower()
29
50
 
30
- def specificity(self, value):
31
- return 1
51
+ def rdf(self, value: str) -> Identifier:
52
+ return URIRef(self.node_id(value))
32
53
 
33
- def country_hint(self, value):
34
- value = stringify(value)
35
- if value is not None:
36
- return value[:2].lower()
54
+ def node_id(self, value: str) -> str:
55
+ return f"iban:{value.upper()}"
37
56
 
38
- def rdf(self, value):
39
- return URIRef('iban:%s' % value)
57
+ def caption(self, value: str) -> str:
58
+ return IBAN.format(value)
@@ -1,34 +1,61 @@
1
1
  import re
2
- from normality import normalize
2
+ from typing import Optional, TYPE_CHECKING
3
+ from rigour.ids import get_identifier_format_names, get_identifier_format
3
4
 
4
5
  from followthemoney.types.common import PropertyType
6
+ from followthemoney.util import dampen, shortest, longest
7
+ from followthemoney.util import defer as _
8
+
9
+ if TYPE_CHECKING:
10
+ from followthemoney.proxy import EntityProxy
5
11
 
6
12
 
7
13
  class IdentifierType(PropertyType):
8
- """Used for registration numbers, codes etc."""
9
- COMPARE_CLEAN = re.compile('[\W_]+')
10
- name = 'identifier'
11
- group = 'identifiers'
12
- prefix = 'ident'
13
- strong = False
14
-
15
- def normalize(self, text, **kwargs):
16
- """Normalize for comparison."""
17
- ids = super(IdentifierType, self).normalize(text, **kwargs)
18
- return [normalize(i) for i in ids]
19
-
20
- def clean_compare(self, value):
14
+ """Used for registration numbers and other codes assigned by an authority
15
+ to identify an entity. This might include tax identifiers and statistical
16
+ codes.
17
+
18
+ Since identifiers are high-value criteria when comparing two entities, numbers
19
+ should only be modelled as identifiers if they are long enough to be meaningful.
20
+ Four- or five-digit industry classifiers create more noise than value."""
21
+
22
+ COMPARE_CLEAN = re.compile(r"[\W_]+")
23
+ name = "identifier"
24
+ group = "identifiers"
25
+ label = _("Identifier")
26
+ plural = _("Identifiers")
27
+ matchable = True
28
+ pivot = True
29
+ max_length = 64
30
+
31
+ def clean_text(
32
+ self,
33
+ text: str,
34
+ fuzzy: bool = False,
35
+ format: Optional[str] = None,
36
+ proxy: Optional["EntityProxy"] = None,
37
+ ) -> Optional[str]:
38
+ if format in get_identifier_format_names():
39
+ format_ = get_identifier_format(format)
40
+ return format_.normalize(text)
41
+ return text
42
+
43
+ def clean_compare(self, value: str) -> str:
21
44
  # TODO: should this be used for normalization?
22
- value = self.COMPARE_CLEAN.sub('', value)
45
+ value = self.COMPARE_CLEAN.sub("", value)
23
46
  return value.lower()
24
47
 
25
- def compare(self, left, right):
48
+ def compare(self, left: str, right: str) -> float:
26
49
  left = self.clean_compare(left)
27
50
  right = self.clean_compare(right)
28
51
  if left == right:
29
- return .9
30
- if left in right:
31
- return .7
32
- if right in left:
33
- return .7
34
- return 0
52
+ return 1.0
53
+ elif left in right or right in left:
54
+ return len(shortest(left, right)) / len(longest(left, right))
55
+ return 0.0
56
+
57
+ def _specificity(self, value: str) -> float:
58
+ return dampen(4, 10, value)
59
+
60
+ def node_id(self, value: str) -> str:
61
+ return f"id:{value}"