followthemoney 1.3.7__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +5 -3
- followthemoney/cli/__init__.py +17 -0
- followthemoney/cli/aggregate.py +56 -0
- followthemoney/cli/cli.py +88 -0
- followthemoney/cli/exports.py +121 -0
- followthemoney/cli/mapping.py +85 -0
- followthemoney/cli/sieve.py +67 -0
- followthemoney/cli/util.py +142 -0
- followthemoney/compare.py +130 -60
- followthemoney/exc.py +19 -6
- followthemoney/export/common.py +29 -0
- followthemoney/export/csv.py +82 -0
- followthemoney/export/excel.py +75 -0
- followthemoney/export/graph.py +79 -0
- followthemoney/export/neo4j.py +182 -0
- followthemoney/export/rdf.py +26 -0
- followthemoney/graph.py +308 -0
- followthemoney/helpers.py +212 -0
- followthemoney/mapping/__init__.py +1 -1
- followthemoney/mapping/csv.py +67 -35
- followthemoney/mapping/entity.py +116 -44
- followthemoney/mapping/property.py +90 -44
- followthemoney/mapping/query.py +27 -19
- followthemoney/mapping/source.py +15 -5
- followthemoney/mapping/sql.py +75 -61
- followthemoney/messages.py +13 -7
- followthemoney/model.py +108 -56
- followthemoney/namespace.py +119 -0
- followthemoney/offshore.py +48 -0
- followthemoney/ontology.py +77 -0
- followthemoney/property.py +204 -71
- followthemoney/proxy.py +455 -118
- followthemoney/rdf.py +9 -0
- followthemoney/schema/Address.yaml +78 -0
- followthemoney/schema/Airplane.yaml +17 -10
- followthemoney/schema/Analyzable.yaml +54 -0
- followthemoney/schema/Article.yaml +16 -0
- followthemoney/schema/Assessment.yaml +32 -0
- followthemoney/schema/Asset.yaml +10 -4
- followthemoney/schema/Associate.yaml +41 -0
- followthemoney/schema/Audio.yaml +24 -0
- followthemoney/schema/BankAccount.yaml +53 -9
- followthemoney/schema/Call.yaml +48 -0
- followthemoney/schema/CallForTenders.yaml +117 -0
- followthemoney/schema/Company.yaml +37 -12
- followthemoney/schema/Contract.yaml +41 -7
- followthemoney/schema/ContractAward.yaml +30 -11
- followthemoney/schema/CourtCase.yaml +16 -10
- followthemoney/schema/CourtCaseParty.yaml +17 -6
- followthemoney/schema/CryptoWallet.yaml +48 -0
- followthemoney/schema/Debt.yaml +37 -0
- followthemoney/schema/Directorship.yaml +17 -4
- followthemoney/schema/Document.yaml +72 -139
- followthemoney/schema/Documentation.yml +38 -0
- followthemoney/schema/EconomicActivity.yaml +32 -17
- followthemoney/schema/Email.yaml +76 -0
- followthemoney/schema/Employment.yaml +39 -0
- followthemoney/schema/Event.yaml +35 -3
- followthemoney/schema/Family.yaml +41 -0
- followthemoney/schema/Folder.yaml +13 -0
- followthemoney/schema/HyperText.yaml +21 -0
- followthemoney/schema/Identification.yaml +40 -0
- followthemoney/schema/Image.yaml +25 -0
- followthemoney/schema/Interest.yaml +3 -6
- followthemoney/schema/Interval.yaml +56 -5
- followthemoney/schema/LegalEntity.yaml +81 -20
- followthemoney/schema/License.yaml +7 -3
- followthemoney/schema/Membership.yaml +19 -4
- followthemoney/schema/Mention.yaml +54 -0
- followthemoney/schema/Message.yaml +78 -0
- followthemoney/schema/Note.yaml +23 -0
- followthemoney/schema/Occupancy.yaml +44 -0
- followthemoney/schema/Organization.yaml +38 -3
- followthemoney/schema/Ownership.yaml +16 -4
- followthemoney/schema/Package.yaml +17 -0
- followthemoney/schema/Page.yaml +43 -0
- followthemoney/schema/Pages.yaml +23 -0
- followthemoney/schema/Passport.yaml +16 -17
- followthemoney/schema/Payment.yaml +38 -7
- followthemoney/schema/Person.yaml +61 -5
- followthemoney/schema/PlainText.yaml +17 -0
- followthemoney/schema/Position.yaml +50 -0
- followthemoney/schema/Post.yaml +42 -0
- followthemoney/schema/Project.yaml +27 -0
- followthemoney/schema/ProjectParticipant.yaml +36 -0
- followthemoney/schema/PublicBody.yaml +14 -3
- followthemoney/schema/RealEstate.yaml +19 -3
- followthemoney/schema/Representation.yaml +17 -6
- followthemoney/schema/Sanction.yaml +45 -21
- followthemoney/schema/Security.yaml +59 -0
- followthemoney/schema/Similar.yaml +37 -0
- followthemoney/schema/Succession.yaml +36 -0
- followthemoney/schema/Table.yaml +32 -0
- followthemoney/schema/TaxRoll.yaml +27 -9
- followthemoney/schema/Thing.yaml +69 -13
- followthemoney/schema/Trip.yaml +42 -0
- followthemoney/schema/UnknownLink.yaml +17 -6
- followthemoney/schema/UserAccount.yaml +44 -0
- followthemoney/schema/Value.yaml +5 -1
- followthemoney/schema/Vehicle.yaml +25 -8
- followthemoney/schema/Vessel.yaml +18 -10
- followthemoney/schema/Video.yaml +20 -0
- followthemoney/schema/Workbook.yaml +18 -0
- followthemoney/schema.py +436 -135
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
- followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
- followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
- followthemoney/translations/fr/followthemoney.po +3861 -0
- followthemoney/translations/messages.pot +3021 -725
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
- followthemoney/translations/ru/followthemoney.po +4221 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
- followthemoney/types/__init__.py +35 -17
- followthemoney/types/address.py +50 -21
- followthemoney/types/checksum.py +25 -0
- followthemoney/types/common.py +233 -88
- followthemoney/types/country.py +50 -56
- followthemoney/types/date.py +59 -76
- followthemoney/types/email.py +66 -35
- followthemoney/types/entity.py +66 -13
- followthemoney/types/gender.py +66 -0
- followthemoney/types/iban.py +47 -28
- followthemoney/types/identifier.py +49 -22
- followthemoney/types/ip.py +35 -21
- followthemoney/types/json.py +58 -0
- followthemoney/types/language.py +124 -37
- followthemoney/types/mimetype.py +44 -0
- followthemoney/types/name.py +56 -12
- followthemoney/types/number.py +30 -0
- followthemoney/types/phone.py +92 -34
- followthemoney/types/registry.py +52 -0
- followthemoney/types/string.py +43 -0
- followthemoney/types/topic.py +94 -0
- followthemoney/types/url.py +39 -17
- followthemoney/util.py +139 -45
- followthemoney-3.8.1.dist-info/METADATA +153 -0
- followthemoney-3.8.1.dist-info/RECORD +157 -0
- {followthemoney-1.3.7.dist-info → followthemoney-3.8.1.dist-info}/WHEEL +1 -2
- followthemoney-3.8.1.dist-info/entry_points.txt +17 -0
- followthemoney-1.3.7.dist-info/LICENSE.txt → followthemoney-3.8.1.dist-info/licenses/LICENSE +1 -1
- followthemoney/link.py +0 -75
- followthemoney/schema/Associate.yml +0 -19
- followthemoney/schema/Family.yml +0 -19
- followthemoney/schema/Land.yml +0 -9
- followthemoney/schema/Relationship.yaml +0 -26
- followthemoney/types/domain.py +0 -50
- followthemoney-1.3.7.dist-info/DESCRIPTION.rst +0 -3
- followthemoney-1.3.7.dist-info/METADATA +0 -39
- followthemoney-1.3.7.dist-info/RECORD +0 -108
- followthemoney-1.3.7.dist-info/entry_points.txt +0 -3
- followthemoney-1.3.7.dist-info/metadata.json +0 -1
- followthemoney-1.3.7.dist-info/namespace_packages.txt +0 -1
- followthemoney-1.3.7.dist-info/top_level.txt +0 -3
- ns/ontology.py +0 -128
- tests/types/test_addresses.py +0 -24
- tests/types/test_common.py +0 -32
- tests/types/test_countries.py +0 -27
- tests/types/test_dates.py +0 -73
- tests/types/test_domains.py +0 -23
- tests/types/test_emails.py +0 -32
- tests/types/test_entity.py +0 -19
- tests/types/test_iban.py +0 -109
- tests/types/test_identifiers.py +0 -27
- tests/types/test_ip.py +0 -29
- tests/types/test_languages.py +0 -23
- tests/types/test_names.py +0 -33
- tests/types/test_phones.py +0 -24
- tests/types/test_registry.py +0 -14
- tests/types/test_urls.py +0 -23
- {ns → followthemoney/export}/__init__.py +0 -0
- /tests/types/__init__.py → /followthemoney/py.typed +0 -0
followthemoney/types/date.py
CHANGED
|
@@ -1,95 +1,78 @@
|
|
|
1
|
-
import re
|
|
2
1
|
import os
|
|
3
|
-
import
|
|
4
|
-
from
|
|
5
|
-
from
|
|
6
|
-
from normality import stringify
|
|
7
|
-
from datetime import datetime, date
|
|
2
|
+
from datetime import datetime, timezone
|
|
3
|
+
from typing import Optional, TYPE_CHECKING
|
|
4
|
+
from prefixdate import parse, parse_format, Precision
|
|
8
5
|
|
|
9
6
|
from followthemoney.types.common import PropertyType
|
|
7
|
+
from followthemoney.rdf import XSD, Literal, Identifier
|
|
8
|
+
from followthemoney.util import defer as _
|
|
10
9
|
from followthemoney.util import dampen
|
|
11
10
|
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from followthemoney.proxy import EntityProxy
|
|
12
13
|
|
|
13
|
-
class DateType(PropertyType):
|
|
14
|
-
# JS: '^([12]\\d{3}(-[01]?[1-9](-[0123]?[1-9])?)?)?$'
|
|
15
|
-
DATE_RE = re.compile('^([12]\d{3}(-[01]?[0-9](-[0123]?[0-9]([T ]([012]?\d(:\d{1,2}(:\d{1,2}(\.\d{6})?(Z|[-+]\d{2}(:?\d{2})?)?)?)?)?)?)?)?)?$') # noqa
|
|
16
|
-
DATE_FULL = re.compile('\d{4}-\d{2}-\d{2}.*')
|
|
17
|
-
CUT_ZEROES = re.compile(r'((\-00.*)|(.00:00:00))$')
|
|
18
|
-
MONTH_FORMATS = re.compile(r'(%b|%B|%m|%c|%x)')
|
|
19
|
-
DAY_FORMATS = re.compile(r'(%d|%w|%c|%x)')
|
|
20
|
-
MAX_LENGTH = 19
|
|
21
14
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
prefix
|
|
15
|
+
class DateType(PropertyType):
|
|
16
|
+
"""A date or time stamp. This is based on ISO 8601, but meant to allow for different
|
|
17
|
+
degrees of precision by specifying a prefix. This means that `2021`, `2021-02`,
|
|
18
|
+
`2021-02-16`, `2021-02-16T21`, `2021-02-16T21:48` and `2021-02-16T21:48:52`
|
|
19
|
+
are all valid values, with an implied precision.
|
|
25
20
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
obj = stringify(obj)
|
|
29
|
-
if obj is None:
|
|
30
|
-
return False
|
|
31
|
-
return self.DATE_RE.match(obj) is not None
|
|
21
|
+
The timezone is always expected to be UTC and cannot be specified otherwise. There is
|
|
22
|
+
no support for calendar weeks (`2021-W7`) and date ranges (`2021-2024`)."""
|
|
32
23
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
return obj.isoformat()[:self.MAX_LENGTH]
|
|
40
|
-
if isinstance(obj, date):
|
|
41
|
-
return obj.isoformat()
|
|
24
|
+
name = "date"
|
|
25
|
+
group = "dates"
|
|
26
|
+
label = _("Date")
|
|
27
|
+
plural = _("Dates")
|
|
28
|
+
matchable = True
|
|
29
|
+
max_length = 32
|
|
42
30
|
|
|
43
|
-
def
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
if not
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
parts = text.split('T', 1)
|
|
53
|
-
date = [p.zfill(2) for p in parts[0].split('-')]
|
|
54
|
-
parts[0] = '-'.join(date)
|
|
55
|
-
text = 'T'.join(parts)
|
|
56
|
-
text = text[:self.MAX_LENGTH]
|
|
57
|
-
# strip -00-00 from dates because it makes ES barf.
|
|
58
|
-
text = self.CUT_ZEROES.sub('', text)
|
|
59
|
-
return text
|
|
31
|
+
def validate(
|
|
32
|
+
self, value: str, fuzzy: bool = False, format: Optional[str] = None
|
|
33
|
+
) -> bool:
|
|
34
|
+
"""Check if a thing is a valid date."""
|
|
35
|
+
if format is not None:
|
|
36
|
+
prefix = parse_format(value, format)
|
|
37
|
+
else:
|
|
38
|
+
prefix = parse(value)
|
|
39
|
+
return prefix.precision != Precision.EMPTY
|
|
60
40
|
|
|
61
|
-
def
|
|
41
|
+
def clean_text(
|
|
42
|
+
self,
|
|
43
|
+
text: str,
|
|
44
|
+
fuzzy: bool = False,
|
|
45
|
+
format: Optional[str] = None,
|
|
46
|
+
proxy: Optional["EntityProxy"] = None,
|
|
47
|
+
) -> Optional[str]:
|
|
62
48
|
"""The classic: date parsing, every which way."""
|
|
63
|
-
# handle date/datetime before converting to text.
|
|
64
|
-
date = self._clean_datetime(text)
|
|
65
|
-
if date is not None:
|
|
66
|
-
return date
|
|
67
|
-
|
|
68
|
-
text = stringify(text)
|
|
69
|
-
if text is None:
|
|
70
|
-
return
|
|
71
|
-
|
|
72
49
|
if format is not None:
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
obj = datetime.strptime(text, format)
|
|
76
|
-
text = obj.date().isoformat()
|
|
77
|
-
if self.MONTH_FORMATS.search(format) is None:
|
|
78
|
-
text = text[:4]
|
|
79
|
-
elif self.DAY_FORMATS.search(format) is None:
|
|
80
|
-
text = text[:7]
|
|
81
|
-
return text
|
|
82
|
-
except Exception:
|
|
83
|
-
return None
|
|
84
|
-
|
|
85
|
-
return self._clean_text(text)
|
|
50
|
+
return parse_format(text, format).text
|
|
51
|
+
return parse(text).text
|
|
86
52
|
|
|
87
|
-
def
|
|
88
|
-
return dampen(5,
|
|
53
|
+
def _specificity(self, value: str) -> float:
|
|
54
|
+
return dampen(5, 13, value)
|
|
89
55
|
|
|
90
|
-
def compare(self, left, right):
|
|
56
|
+
def compare(self, left: str, right: str) -> float:
|
|
91
57
|
prefix = os.path.commonprefix([left, right])
|
|
92
58
|
return dampen(4, 10, prefix)
|
|
93
59
|
|
|
94
|
-
def rdf(self, value):
|
|
60
|
+
def rdf(self, value: str) -> Identifier:
|
|
61
|
+
if len(value) < Precision.HOUR.value:
|
|
62
|
+
return Literal(value, datatype=XSD.date)
|
|
95
63
|
return Literal(value, datatype=XSD.dateTime)
|
|
64
|
+
|
|
65
|
+
def node_id(self, value: str) -> str:
|
|
66
|
+
return f"date:{value}"
|
|
67
|
+
|
|
68
|
+
def to_datetime(self, value: str) -> Optional[datetime]:
|
|
69
|
+
return parse(value).dt
|
|
70
|
+
|
|
71
|
+
def to_number(self, value: str) -> Optional[float]:
|
|
72
|
+
date = self.to_datetime(value)
|
|
73
|
+
if date is None:
|
|
74
|
+
return None
|
|
75
|
+
# We make a best effort all over the app to ensure all times are in UTC.
|
|
76
|
+
if date.tzinfo is None:
|
|
77
|
+
date = date.replace(tzinfo=timezone.utc)
|
|
78
|
+
return date.timestamp()
|
followthemoney/types/email.py
CHANGED
|
@@ -1,54 +1,85 @@
|
|
|
1
1
|
import re
|
|
2
|
-
|
|
3
|
-
from
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Optional, TYPE_CHECKING
|
|
4
|
+
from urllib.parse import urlparse
|
|
4
5
|
from normality.cleaning import strip_quotes
|
|
5
6
|
|
|
7
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
6
8
|
from followthemoney.types.common import PropertyType
|
|
7
|
-
from followthemoney.
|
|
9
|
+
from followthemoney.util import sanitize_text, defer as _
|
|
10
|
+
|
|
11
|
+
log = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from followthemoney.proxy import EntityProxy
|
|
8
15
|
|
|
9
16
|
|
|
10
17
|
class EmailType(PropertyType):
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
"""Internet mail address (e.g. user@example.com). These are notoriously hard
|
|
19
|
+
to validate, but we use an irresponsibly simple rule and hope for the best."""
|
|
20
|
+
|
|
21
|
+
REGEX_RAW = r"^[^@\s]+@[^@\s]+\.\w+$"
|
|
22
|
+
REGEX = re.compile(REGEX_RAW)
|
|
23
|
+
name = "email"
|
|
24
|
+
group = "emails"
|
|
25
|
+
label = _("E-Mail Address")
|
|
26
|
+
plural = _("E-Mail Addresses")
|
|
27
|
+
matchable = True
|
|
28
|
+
pivot = True
|
|
29
|
+
|
|
30
|
+
# def _check_exists(self, domain):
|
|
31
|
+
# """Actually try to resolve a domain name."""
|
|
32
|
+
# try:
|
|
33
|
+
# domain = domain.encode('idna').lower()
|
|
34
|
+
# socket.getaddrinfo(domain, None)
|
|
35
|
+
# return True
|
|
36
|
+
# except:
|
|
37
|
+
# return False
|
|
38
|
+
|
|
39
|
+
def validate(
|
|
40
|
+
self, value: str, fuzzy: bool = False, format: Optional[str] = None
|
|
41
|
+
) -> bool:
|
|
19
42
|
"""Check to see if this is a valid email address."""
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
if not self.EMAIL_REGEX.match(email):
|
|
43
|
+
# TODO: adopt email.utils.parseaddr
|
|
44
|
+
email = sanitize_text(value)
|
|
45
|
+
if email is None or not self.REGEX.match(email):
|
|
24
46
|
return False
|
|
25
|
-
|
|
26
|
-
|
|
47
|
+
_, domain = email.rsplit("@", 1)
|
|
48
|
+
if len(domain) < 4 or "." not in domain:
|
|
49
|
+
return False
|
|
50
|
+
return True
|
|
27
51
|
|
|
28
|
-
def clean_text(
|
|
52
|
+
def clean_text(
|
|
53
|
+
self,
|
|
54
|
+
text: str,
|
|
55
|
+
fuzzy: bool = False,
|
|
56
|
+
format: Optional[str] = None,
|
|
57
|
+
proxy: Optional["EntityProxy"] = None,
|
|
58
|
+
) -> Optional[str]:
|
|
29
59
|
"""Parse and normalize an email address.
|
|
30
60
|
|
|
31
61
|
Returns None if this is not an email address.
|
|
32
62
|
"""
|
|
33
|
-
|
|
63
|
+
email = strip_quotes(text)
|
|
64
|
+
if email is None or not self.REGEX.match(email):
|
|
34
65
|
return None
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
return
|
|
66
|
+
mailbox, domain = email.rsplit("@", 1)
|
|
67
|
+
# TODO: https://pypi.python.org/pypi/publicsuffix/
|
|
68
|
+
# handle URLs by extracting the domain name
|
|
69
|
+
domain = urlparse(domain).hostname or domain
|
|
70
|
+
domain = domain.lower()
|
|
71
|
+
domain = domain.rstrip(".")
|
|
72
|
+
# handle unicode
|
|
73
|
+
try:
|
|
74
|
+
domain = domain.encode("idna").decode("ascii")
|
|
75
|
+
except UnicodeError:
|
|
76
|
+
return None
|
|
77
|
+
if domain is not None and mailbox is not None:
|
|
78
|
+
return "@".join((mailbox, domain))
|
|
79
|
+
return None
|
|
49
80
|
|
|
50
81
|
# def country_hint(self, value)
|
|
51
82
|
# TODO: do we want to use TLDs as country evidence?
|
|
52
83
|
|
|
53
|
-
def rdf(self, value):
|
|
54
|
-
return URIRef(
|
|
84
|
+
def rdf(self, value: str) -> Identifier:
|
|
85
|
+
return URIRef("mailto:%s" % value.lower())
|
followthemoney/types/entity.py
CHANGED
|
@@ -1,21 +1,74 @@
|
|
|
1
|
-
|
|
2
|
-
from
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any, Optional, TYPE_CHECKING
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
6
|
+
from followthemoney.util import ENTITY_ID_LEN, get_entity_id, sanitize_text
|
|
7
|
+
from followthemoney.util import gettext, defer as _
|
|
8
|
+
from followthemoney.exc import InvalidData
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from followthemoney.proxy import EntityProxy
|
|
5
12
|
|
|
6
13
|
|
|
7
14
|
class EntityType(PropertyType):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
15
|
+
"""A reference to another entity via its ID. This is how entities in FtM
|
|
16
|
+
become a graph: by pointing at each other using references.
|
|
17
|
+
|
|
18
|
+
Entity IDs can either be `namespaced` or `plain`, depending on the context.
|
|
19
|
+
When setting properties of this type, you can pass in an entity proxy or
|
|
20
|
+
dict of the entity, the ID will then be extracted and stored.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
REGEX_RAW = r"^[0-9a-zA-Z]([0-9a-zA-Z\.\-]*[0-9a-zA-Z])?$"
|
|
24
|
+
REGEX = re.compile(REGEX_RAW)
|
|
25
|
+
name = "entity"
|
|
26
|
+
group = "entities"
|
|
27
|
+
label = _("Entity")
|
|
28
|
+
plural = _("Entities")
|
|
29
|
+
matchable = True
|
|
30
|
+
pivot = True
|
|
31
|
+
max_length = ENTITY_ID_LEN
|
|
32
|
+
|
|
33
|
+
def validate(
|
|
34
|
+
self, value: str, fuzzy: bool = False, format: Optional[str] = None
|
|
35
|
+
) -> bool:
|
|
36
|
+
text = sanitize_text(value)
|
|
37
|
+
if text is None:
|
|
38
|
+
return False
|
|
39
|
+
return self.REGEX.match(text) is not None
|
|
40
|
+
|
|
41
|
+
def clean(
|
|
42
|
+
self,
|
|
43
|
+
raw: Any,
|
|
44
|
+
fuzzy: bool = False,
|
|
45
|
+
format: Optional[str] = None,
|
|
46
|
+
proxy: Optional["EntityProxy"] = None,
|
|
47
|
+
) -> Optional[str]:
|
|
48
|
+
entity_id = get_entity_id(raw)
|
|
49
|
+
if entity_id is None:
|
|
50
|
+
return None
|
|
51
|
+
return self.clean_text(entity_id, fuzzy=fuzzy, format=format, proxy=proxy)
|
|
11
52
|
|
|
12
|
-
def
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
53
|
+
def clean_text(
|
|
54
|
+
self,
|
|
55
|
+
text: str,
|
|
56
|
+
fuzzy: bool = False,
|
|
57
|
+
format: Optional[str] = None,
|
|
58
|
+
proxy: Optional["EntityProxy"] = None,
|
|
59
|
+
) -> Optional[str]:
|
|
60
|
+
"""Specific types can apply their own cleaning routines here (this is called
|
|
61
|
+
by ``clean`` after the value has been converted to a string and null values
|
|
62
|
+
have been filtered)."""
|
|
63
|
+
if proxy is not None and text == proxy.id:
|
|
64
|
+
msg = gettext("Self-relationship (%s): %s")
|
|
65
|
+
raise InvalidData(msg % (proxy.schema, text))
|
|
66
|
+
if self.REGEX.match(text) is not None:
|
|
67
|
+
return text
|
|
68
|
+
return None
|
|
16
69
|
|
|
17
|
-
def
|
|
18
|
-
return
|
|
70
|
+
def rdf(self, value: str) -> Identifier:
|
|
71
|
+
return URIRef(f"entity:{value}")
|
|
19
72
|
|
|
20
|
-
def
|
|
21
|
-
return
|
|
73
|
+
def caption(self, value: str) -> None:
|
|
74
|
+
return None
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from typing import Optional, TYPE_CHECKING
|
|
2
|
+
from babel.core import Locale
|
|
3
|
+
|
|
4
|
+
from followthemoney.types.common import EnumType, EnumValues
|
|
5
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
6
|
+
from followthemoney.util import gettext, defer as _
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from followthemoney.proxy import EntityProxy
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class GenderType(EnumType):
|
|
13
|
+
"""A human gender. This is not meant to be a comprehensive model of
|
|
14
|
+
the social realities of gender but a way to capture data from (mostly)
|
|
15
|
+
government databases and represent it in a way that can be used by
|
|
16
|
+
structured tools. I'm not sure this justifies the simplification."""
|
|
17
|
+
|
|
18
|
+
MALE = "male"
|
|
19
|
+
FEMALE = "female"
|
|
20
|
+
OTHER = "other"
|
|
21
|
+
|
|
22
|
+
LOOKUP = {
|
|
23
|
+
"m": MALE,
|
|
24
|
+
"man": MALE,
|
|
25
|
+
"masculin": MALE,
|
|
26
|
+
"männlich": MALE,
|
|
27
|
+
"мужской": MALE,
|
|
28
|
+
"f": FEMALE,
|
|
29
|
+
"woman": FEMALE,
|
|
30
|
+
"féminin": FEMALE,
|
|
31
|
+
"weiblich": FEMALE,
|
|
32
|
+
"женский": FEMALE,
|
|
33
|
+
"o": OTHER,
|
|
34
|
+
"d": OTHER,
|
|
35
|
+
"divers": OTHER,
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
name = "gender"
|
|
39
|
+
group = "genders"
|
|
40
|
+
label = _("Gender")
|
|
41
|
+
plural = _("Genders")
|
|
42
|
+
matchable = False
|
|
43
|
+
max_length = 16
|
|
44
|
+
|
|
45
|
+
def _locale_names(self, locale: Locale) -> EnumValues:
|
|
46
|
+
return {
|
|
47
|
+
self.MALE: gettext("male"),
|
|
48
|
+
self.FEMALE: gettext("female"),
|
|
49
|
+
self.OTHER: gettext("other"),
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
def clean_text(
|
|
53
|
+
self,
|
|
54
|
+
text: str,
|
|
55
|
+
fuzzy: bool = False,
|
|
56
|
+
format: Optional[str] = None,
|
|
57
|
+
proxy: Optional["EntityProxy"] = None,
|
|
58
|
+
) -> Optional[str]:
|
|
59
|
+
code = text.lower().strip()
|
|
60
|
+
code = self.LOOKUP.get(code, code)
|
|
61
|
+
if code not in self.codes:
|
|
62
|
+
return None
|
|
63
|
+
return code
|
|
64
|
+
|
|
65
|
+
def rdf(self, value: str) -> Identifier:
|
|
66
|
+
return URIRef(f"gender:{value}")
|
followthemoney/types/iban.py
CHANGED
|
@@ -1,39 +1,58 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
3
|
-
from schwifty import IBAN
|
|
1
|
+
from typing import Optional, TYPE_CHECKING
|
|
2
|
+
from rigour.ids import IBAN
|
|
4
3
|
|
|
5
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
6
|
+
from followthemoney.util import sanitize_text, defer as _
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from followthemoney.proxy import EntityProxy
|
|
6
10
|
|
|
7
11
|
|
|
8
12
|
class IbanType(PropertyType):
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
13
|
+
"""An international bank account number, as defined in ISO 13616. IBANs are
|
|
14
|
+
managed by SWIFT used in the European SEPA payment system.
|
|
15
|
+
|
|
16
|
+
A notable aspect of IBANs is that they share a country prefix and validation
|
|
17
|
+
mechanism, but the specific length of an IBAN is dependent on the country
|
|
18
|
+
code defined in the first two characters: `NO8330001234567` and
|
|
19
|
+
`CY21002001950000357001234567` are both valid values."""
|
|
20
|
+
|
|
21
|
+
name = "iban"
|
|
22
|
+
group = "ibans"
|
|
23
|
+
label = _("IBAN")
|
|
24
|
+
plural = _("IBANs")
|
|
25
|
+
matchable = True
|
|
26
|
+
pivot = True
|
|
27
|
+
max_length = 64
|
|
28
|
+
|
|
29
|
+
def validate(
|
|
30
|
+
self, value: str, fuzzy: bool = False, format: Optional[str] = None
|
|
31
|
+
) -> bool:
|
|
32
|
+
text = sanitize_text(value)
|
|
33
|
+
if text is None:
|
|
23
34
|
return False
|
|
24
|
-
|
|
25
|
-
|
|
35
|
+
return IBAN.is_valid(text)
|
|
36
|
+
|
|
37
|
+
def clean_text(
|
|
38
|
+
self,
|
|
39
|
+
text: str,
|
|
40
|
+
fuzzy: bool = False,
|
|
41
|
+
format: Optional[str] = None,
|
|
42
|
+
proxy: Optional["EntityProxy"] = None,
|
|
43
|
+
) -> Optional[str]:
|
|
26
44
|
"""Create a more clean, but still user-facing version of an
|
|
27
45
|
instance of the type."""
|
|
28
|
-
return
|
|
46
|
+
return IBAN.normalize(text)
|
|
47
|
+
|
|
48
|
+
def country_hint(self, value: str) -> str:
|
|
49
|
+
return value[:2].lower()
|
|
29
50
|
|
|
30
|
-
def
|
|
31
|
-
return
|
|
51
|
+
def rdf(self, value: str) -> Identifier:
|
|
52
|
+
return URIRef(self.node_id(value))
|
|
32
53
|
|
|
33
|
-
def
|
|
34
|
-
value
|
|
35
|
-
if value is not None:
|
|
36
|
-
return value[:2].lower()
|
|
54
|
+
def node_id(self, value: str) -> str:
|
|
55
|
+
return f"iban:{value.upper()}"
|
|
37
56
|
|
|
38
|
-
def
|
|
39
|
-
return
|
|
57
|
+
def caption(self, value: str) -> str:
|
|
58
|
+
return IBAN.format(value)
|
|
@@ -1,34 +1,61 @@
|
|
|
1
1
|
import re
|
|
2
|
-
from
|
|
2
|
+
from typing import Optional, TYPE_CHECKING
|
|
3
|
+
from rigour.ids import get_identifier_format_names, get_identifier_format
|
|
3
4
|
|
|
4
5
|
from followthemoney.types.common import PropertyType
|
|
6
|
+
from followthemoney.util import dampen, shortest, longest
|
|
7
|
+
from followthemoney.util import defer as _
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from followthemoney.proxy import EntityProxy
|
|
5
11
|
|
|
6
12
|
|
|
7
13
|
class IdentifierType(PropertyType):
|
|
8
|
-
"""Used for registration numbers
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
14
|
+
"""Used for registration numbers and other codes assigned by an authority
|
|
15
|
+
to identify an entity. This might include tax identifiers and statistical
|
|
16
|
+
codes.
|
|
17
|
+
|
|
18
|
+
Since identifiers are high-value criteria when comparing two entities, numbers
|
|
19
|
+
should only be modelled as identifiers if they are long enough to be meaningful.
|
|
20
|
+
Four- or five-digit industry classifiers create more noise than value."""
|
|
21
|
+
|
|
22
|
+
COMPARE_CLEAN = re.compile(r"[\W_]+")
|
|
23
|
+
name = "identifier"
|
|
24
|
+
group = "identifiers"
|
|
25
|
+
label = _("Identifier")
|
|
26
|
+
plural = _("Identifiers")
|
|
27
|
+
matchable = True
|
|
28
|
+
pivot = True
|
|
29
|
+
max_length = 64
|
|
30
|
+
|
|
31
|
+
def clean_text(
|
|
32
|
+
self,
|
|
33
|
+
text: str,
|
|
34
|
+
fuzzy: bool = False,
|
|
35
|
+
format: Optional[str] = None,
|
|
36
|
+
proxy: Optional["EntityProxy"] = None,
|
|
37
|
+
) -> Optional[str]:
|
|
38
|
+
if format in get_identifier_format_names():
|
|
39
|
+
format_ = get_identifier_format(format)
|
|
40
|
+
return format_.normalize(text)
|
|
41
|
+
return text
|
|
42
|
+
|
|
43
|
+
def clean_compare(self, value: str) -> str:
|
|
21
44
|
# TODO: should this be used for normalization?
|
|
22
|
-
value = self.COMPARE_CLEAN.sub(
|
|
45
|
+
value = self.COMPARE_CLEAN.sub("", value)
|
|
23
46
|
return value.lower()
|
|
24
47
|
|
|
25
|
-
def compare(self, left, right):
|
|
48
|
+
def compare(self, left: str, right: str) -> float:
|
|
26
49
|
left = self.clean_compare(left)
|
|
27
50
|
right = self.clean_compare(right)
|
|
28
51
|
if left == right:
|
|
29
|
-
return .
|
|
30
|
-
|
|
31
|
-
return
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
52
|
+
return 1.0
|
|
53
|
+
elif left in right or right in left:
|
|
54
|
+
return len(shortest(left, right)) / len(longest(left, right))
|
|
55
|
+
return 0.0
|
|
56
|
+
|
|
57
|
+
def _specificity(self, value: str) -> float:
|
|
58
|
+
return dampen(4, 10, value)
|
|
59
|
+
|
|
60
|
+
def node_id(self, value: str) -> str:
|
|
61
|
+
return f"id:{value}"
|