followthemoney 1.3.6__py3-none-any.whl → 3.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +5 -3
- followthemoney/cli/__init__.py +17 -0
- followthemoney/cli/aggregate.py +56 -0
- followthemoney/cli/cli.py +88 -0
- followthemoney/cli/exports.py +121 -0
- followthemoney/cli/mapping.py +85 -0
- followthemoney/cli/sieve.py +67 -0
- followthemoney/cli/util.py +142 -0
- followthemoney/compare.py +132 -55
- followthemoney/exc.py +19 -6
- followthemoney/export/common.py +29 -0
- followthemoney/export/csv.py +82 -0
- followthemoney/export/excel.py +75 -0
- followthemoney/export/graph.py +79 -0
- followthemoney/export/neo4j.py +182 -0
- followthemoney/export/rdf.py +26 -0
- followthemoney/graph.py +308 -0
- followthemoney/helpers.py +212 -0
- followthemoney/mapping/__init__.py +1 -1
- followthemoney/mapping/csv.py +67 -35
- followthemoney/mapping/entity.py +116 -44
- followthemoney/mapping/property.py +90 -44
- followthemoney/mapping/query.py +27 -19
- followthemoney/mapping/source.py +15 -5
- followthemoney/mapping/sql.py +75 -61
- followthemoney/messages.py +13 -7
- followthemoney/model.py +108 -56
- followthemoney/namespace.py +119 -0
- followthemoney/offshore.py +48 -0
- followthemoney/ontology.py +77 -0
- followthemoney/property.py +204 -71
- followthemoney/proxy.py +455 -118
- followthemoney/rdf.py +9 -0
- followthemoney/schema/Address.yaml +78 -0
- followthemoney/schema/Airplane.yaml +17 -10
- followthemoney/schema/Analyzable.yaml +54 -0
- followthemoney/schema/Article.yaml +16 -0
- followthemoney/schema/Assessment.yaml +32 -0
- followthemoney/schema/Asset.yaml +10 -4
- followthemoney/schema/Associate.yaml +41 -0
- followthemoney/schema/Audio.yaml +24 -0
- followthemoney/schema/BankAccount.yaml +53 -9
- followthemoney/schema/Call.yaml +48 -0
- followthemoney/schema/CallForTenders.yaml +117 -0
- followthemoney/schema/Company.yaml +37 -12
- followthemoney/schema/Contract.yaml +41 -7
- followthemoney/schema/ContractAward.yaml +30 -11
- followthemoney/schema/CourtCase.yaml +16 -10
- followthemoney/schema/CourtCaseParty.yaml +17 -6
- followthemoney/schema/CryptoWallet.yaml +48 -0
- followthemoney/schema/Debt.yaml +37 -0
- followthemoney/schema/Directorship.yaml +17 -4
- followthemoney/schema/Document.yaml +72 -139
- followthemoney/schema/Documentation.yml +38 -0
- followthemoney/schema/EconomicActivity.yaml +32 -17
- followthemoney/schema/Email.yaml +76 -0
- followthemoney/schema/Employment.yaml +39 -0
- followthemoney/schema/Event.yaml +35 -3
- followthemoney/schema/Family.yaml +41 -0
- followthemoney/schema/Folder.yaml +13 -0
- followthemoney/schema/HyperText.yaml +21 -0
- followthemoney/schema/Identification.yaml +40 -0
- followthemoney/schema/Image.yaml +25 -0
- followthemoney/schema/Interest.yaml +3 -6
- followthemoney/schema/Interval.yaml +56 -5
- followthemoney/schema/LegalEntity.yaml +81 -20
- followthemoney/schema/License.yaml +7 -3
- followthemoney/schema/Membership.yaml +19 -4
- followthemoney/schema/Mention.yaml +54 -0
- followthemoney/schema/Message.yaml +73 -0
- followthemoney/schema/Note.yaml +23 -0
- followthemoney/schema/Occupancy.yaml +40 -0
- followthemoney/schema/Organization.yaml +38 -3
- followthemoney/schema/Ownership.yaml +16 -4
- followthemoney/schema/Package.yaml +17 -0
- followthemoney/schema/Page.yaml +43 -0
- followthemoney/schema/Pages.yaml +23 -0
- followthemoney/schema/Passport.yaml +15 -17
- followthemoney/schema/Payment.yaml +38 -7
- followthemoney/schema/Person.yaml +61 -5
- followthemoney/schema/PlainText.yaml +17 -0
- followthemoney/schema/Position.yaml +50 -0
- followthemoney/schema/Post.yaml +42 -0
- followthemoney/schema/Project.yaml +27 -0
- followthemoney/schema/ProjectParticipant.yaml +36 -0
- followthemoney/schema/PublicBody.yaml +14 -3
- followthemoney/schema/RealEstate.yaml +19 -3
- followthemoney/schema/Representation.yaml +17 -6
- followthemoney/schema/Sanction.yaml +44 -20
- followthemoney/schema/Security.yaml +59 -0
- followthemoney/schema/Similar.yaml +37 -0
- followthemoney/schema/Succession.yaml +36 -0
- followthemoney/schema/Table.yaml +32 -0
- followthemoney/schema/TaxRoll.yaml +27 -9
- followthemoney/schema/Thing.yaml +69 -13
- followthemoney/schema/Trip.yaml +42 -0
- followthemoney/schema/UnknownLink.yaml +17 -6
- followthemoney/schema/UserAccount.yaml +44 -0
- followthemoney/schema/Value.yaml +5 -1
- followthemoney/schema/Vehicle.yaml +25 -8
- followthemoney/schema/Vessel.yaml +18 -10
- followthemoney/schema/Video.yaml +20 -0
- followthemoney/schema/Workbook.yaml +18 -0
- followthemoney/schema.py +406 -135
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
- followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
- followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
- followthemoney/translations/fr/followthemoney.po +3861 -0
- followthemoney/translations/messages.pot +3021 -725
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
- followthemoney/translations/ru/followthemoney.po +4221 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
- followthemoney/types/__init__.py +35 -17
- followthemoney/types/address.py +41 -21
- followthemoney/types/checksum.py +25 -0
- followthemoney/types/common.py +233 -88
- followthemoney/types/country.py +89 -56
- followthemoney/types/date.py +59 -76
- followthemoney/types/email.py +66 -35
- followthemoney/types/entity.py +66 -13
- followthemoney/types/gender.py +66 -0
- followthemoney/types/iban.py +47 -28
- followthemoney/types/identifier.py +49 -22
- followthemoney/types/ip.py +35 -21
- followthemoney/types/json.py +58 -0
- followthemoney/types/language.py +124 -37
- followthemoney/types/mimetype.py +44 -0
- followthemoney/types/name.py +56 -12
- followthemoney/types/number.py +30 -0
- followthemoney/types/phone.py +92 -34
- followthemoney/types/registry.py +52 -0
- followthemoney/types/string.py +43 -0
- followthemoney/types/topic.py +94 -0
- followthemoney/types/url.py +39 -17
- followthemoney/util.py +139 -45
- followthemoney-3.8.0.dist-info/METADATA +153 -0
- followthemoney-3.8.0.dist-info/RECORD +157 -0
- {followthemoney-1.3.6.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
- followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
- followthemoney-1.3.6.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
- followthemoney/link.py +0 -75
- followthemoney/schema/Associate.yml +0 -19
- followthemoney/schema/Family.yml +0 -19
- followthemoney/schema/Land.yml +0 -9
- followthemoney/schema/Relationship.yaml +0 -26
- followthemoney/types/domain.py +0 -50
- followthemoney-1.3.6.dist-info/DESCRIPTION.rst +0 -3
- followthemoney-1.3.6.dist-info/METADATA +0 -39
- followthemoney-1.3.6.dist-info/RECORD +0 -108
- followthemoney-1.3.6.dist-info/entry_points.txt +0 -3
- followthemoney-1.3.6.dist-info/metadata.json +0 -1
- followthemoney-1.3.6.dist-info/namespace_packages.txt +0 -1
- followthemoney-1.3.6.dist-info/top_level.txt +0 -3
- ns/ontology.py +0 -128
- tests/types/test_addresses.py +0 -24
- tests/types/test_common.py +0 -27
- tests/types/test_countries.py +0 -21
- tests/types/test_dates.py +0 -72
- tests/types/test_domains.py +0 -23
- tests/types/test_emails.py +0 -30
- tests/types/test_entity.py +0 -16
- tests/types/test_iban.py +0 -109
- tests/types/test_identifiers.py +0 -25
- tests/types/test_ip.py +0 -26
- tests/types/test_languages.py +0 -20
- tests/types/test_names.py +0 -33
- tests/types/test_phones.py +0 -24
- tests/types/test_registry.py +0 -14
- tests/types/test_urls.py +0 -23
- {ns → followthemoney/export}/__init__.py +0 -0
- /tests/types/__init__.py → /followthemoney/py.typed +0 -0
followthemoney/types/phone.py
CHANGED
|
@@ -1,31 +1,54 @@
|
|
|
1
|
-
from
|
|
2
|
-
from banal import ensure_list
|
|
3
|
-
from phonenumbers import geocoder
|
|
1
|
+
from typing import Iterable, Optional, TYPE_CHECKING
|
|
4
2
|
from phonenumbers import parse as parse_number
|
|
5
|
-
from phonenumbers import
|
|
6
|
-
from phonenumbers import PhoneNumberFormat
|
|
7
|
-
from phonenumbers.phonenumberutil import NumberParseException
|
|
3
|
+
from phonenumbers import is_valid_number, format_number
|
|
4
|
+
from phonenumbers import PhoneNumber, PhoneNumberFormat
|
|
5
|
+
from phonenumbers.phonenumberutil import region_code_for_number, NumberParseException
|
|
8
6
|
|
|
9
7
|
from followthemoney.types.common import PropertyType
|
|
8
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
9
|
+
from followthemoney.util import defer as _
|
|
10
|
+
from followthemoney.util import dampen
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from followthemoney.proxy import EntityProxy
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# TODO: for json schema export
|
|
17
|
+
# https://stackoverflow.com/questions/6478875/regular-expression-matching-e-164-formatted-phone-numbers
|
|
10
18
|
|
|
11
19
|
|
|
12
20
|
class PhoneType(PropertyType):
    """Property type for phone numbers stored in E.164 format.

    E.164 numbers always include an international country prefix (e.g.
    `+38760183628`). To cope with numbers written without that prefix, the
    cleaning function accepts an entity proxy and tries the proxy's countries
    as candidate regions when parsing.

    When adding a property of this type to an entity, any country-type
    properties defined for the entity are considered. Adding a phone number
    before adding a country can therefore have a different outcome from doing
    the two operations the other way around. Always define the country first.
    """

    name = "phone"
    group = "phones"
    label = _("Phone number")
    plural = _("Phone numbers")
    matchable = True
    pivot = True
    max_length = 64

    def _clean_countries(
        self, proxy: Optional["EntityProxy"]
    ) -> Iterable[Optional[str]]:
        """Yield the candidate regions to try when parsing a number.

        The first candidate is always `None` (no region hint); after that,
        each entry of `proxy.countries` is yielded in upper case.
        """
        yield None
        if proxy is None:
            return
        yield from (country.upper() for country in proxy.countries)

    def _parse_number(
        self, number: str, proxy: Optional["EntityProxy"] = None
    ) -> Iterable[PhoneNumber]:
        """Yield a parsed `PhoneNumber` for each region that can parse `number`.

        Candidate regions come from `_clean_countries`. A region for which
        parsing raises `NumberParseException` is skipped. Validity of the
        parsed numbers is not checked here; callers do that themselves.

        https://github.com/daviddrysdale/python-phonenumbers
        """
        for region in self._clean_countries(proxy):
            try:
                parsed = parse_number(number, region)
            except NumberParseException:
                continue
            yield parsed

    def validate(
        self, value: str, fuzzy: bool = False, format: Optional[str] = None
    ) -> bool:
        """Return True if `value` parses to a valid number.

        No proxy is passed on, so parsing is attempted without a region hint
        only. `fuzzy` and `format` are not used.
        """
        return any(is_valid_number(num) for num in self._parse_number(value))

    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Return the first valid parse of `text` in E.164 form, or None.

        `fuzzy` and `format` are not used.
        """
        candidates = self._parse_number(text, proxy=proxy)
        valid = next((num for num in candidates if is_valid_number(num)), None)
        if valid is None:
            return None
        return str(format_number(valid, PhoneNumberFormat.E164))

    def country_hint(self, value: str) -> Optional[str]:
        """Return the lower-cased region code of `value`, or None.

        None is returned when the value cannot be parsed or when no region
        code is found for the parsed number.
        """
        try:
            region = region_code_for_number(parse_number(value))
        except NumberParseException:
            return None
        return None if region is None else str(region).lower()

    def _specificity(self, value: str) -> float:
        """Score `value` by its length via `dampen(7, 11, value)`."""
        # TODO: insert artificial intelligence here.
        return dampen(7, 11, value)

    def rdf(self, value: str) -> Identifier:
        """Return a `URIRef` built from the node ID of `value`.

        Raises:
            ValueError: if `node_id` returns None for the value.
        """
        node_id = self.node_id(value)
        if node_id is None:
            raise ValueError("Invalid phone number for serialisation: %s" % value)
        return URIRef(node_id)

    def node_id(self, value: str) -> Optional[str]:
        """Return `value` prefixed with `tel:`."""
        return f"tel:{value}"

    def caption(self, value: str) -> str:
        """Return `value` in international format, or unchanged if unparseable."""
        try:
            return str(
                format_number(parse_number(value), PhoneNumberFormat.INTERNATIONAL)
            )
        except NumberParseException:
            return value
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from banal import ensure_list
|
|
2
|
+
from typing import Iterable, Set, Dict, Type, Union, List, Optional
|
|
3
|
+
|
|
4
|
+
from followthemoney.types.common import PropertyType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class _UnknownTypeError(AttributeError, KeyError):
    """Raised by attribute-style lookup of a type name that is not registered.

    It derives from `AttributeError` so that `hasattr()` and three-argument
    `getattr()` behave normally, and from `KeyError` so that callers which
    caught the previous `KeyError` keep working.
    """


class Registry(object):
    """This registry keeps the processing helpers for all property types
    in the system. Each type class passed to `add` is instantiated once and
    that single instance is shared. The registry can be used to get a type,
    which can itself then clean, validate or format values of that type."""

    def __init__(self) -> None:
        # Type instances by their `name`.
        self.named: Dict[str, PropertyType] = {}
        # Types whose `matchable` flag is truthy.
        self.matchable: Set[PropertyType] = set()
        # Every registered type instance.
        self.types: Set[PropertyType] = set()
        # Type instances by their `group` name (types without a group are absent).
        self.groups: Dict[str, PropertyType] = {}
        # Types whose `pivot` flag is truthy.
        self.pivots: Set[PropertyType] = set()

    def add(self, clazz: Type[PropertyType]) -> None:
        """Instantiate `clazz` and index the instance in all lookup tables."""
        type_ = clazz()
        self.named[clazz.name] = type_
        self.types.add(type_)
        if type_.matchable:
            self.matchable.add(type_)
        if type_.pivot:
            self.pivots.add(type_)
        if type_.group is not None:
            self.groups[type_.group] = type_

    def get(self, name: Union[str, PropertyType]) -> Optional[PropertyType]:
        """For a given property type name, get its type object, or None if
        the name is not registered. This can also be used via getattr, e.g.
        ``registry.phone``."""
        # Allow transparent re-checking: a type object is returned as-is.
        if isinstance(name, PropertyType):
            return name
        return self.named.get(name)

    def get_types(
        self, names: Iterable[Union[str, PropertyType]]
    ) -> List[PropertyType]:
        """Get the type objects for the given names, skipping unknown names."""
        names = ensure_list(names)
        types = [self.get(n) for n in names]
        return [t for t in types if t is not None]

    def __getitem__(self, name: str) -> PropertyType:
        """Return the type registered as `name`; raise `KeyError` if unknown."""
        return self.named[name]

    def __getattr__(self, name: str) -> PropertyType:
        """Return the type registered as `name` via attribute access.

        Raises:
            AttributeError: if no type is registered under `name`. The
                exception is also a `KeyError` for backward compatibility.
        """
        # This hook only runs when normal attribute lookup has failed. Read
        # `named` through the instance dict: going through `self.named` on an
        # instance where `named` is not set yet would re-enter this method
        # and recurse without bound.
        named = self.__dict__.get("named", {})
        try:
            return named[name]
        except KeyError:
            raise _UnknownTypeError(name) from None
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from followthemoney.types.common import PropertyType
|
|
2
|
+
from followthemoney.util import defer as _
|
|
3
|
+
from followthemoney.util import MEGABYTE
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class StringType(PropertyType):
    """Plain string values that carry no semantics beyond their text."""

    name = "string"
    matchable = False
    max_length = 1024
    label = _("Label")
    plural = _("Labels")

    def node_id(self, value: str) -> None:
        """Return None for every value."""
        return None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class TextType(StringType):
    """Longer fragments of text, for example descriptions or the text of a
    document. In contrast to string properties, it can make sense to treat
    values of this type as material for full-text search."""

    name = "text"
    max_length = 65000
    total_size = 30 * MEGABYTE
    label = _("Text")
    plural = _("Texts")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class HTMLType(StringType):
    """Values holding raw hypertext markup (HTML).

    Any user interface that renders values of this type must take extreme
    care to prevent attacks such as cross-site scripting. The recommendation
    is to sanitise the markup on the server side, or not to render this
    property at all.
    """

    name = "html"
    max_length = 65000
    total_size = 30 * MEGABYTE
    label = _("HTML")
    plural = _("HTMLs")
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from babel.core import Locale
|
|
2
|
+
|
|
3
|
+
from followthemoney.types.common import EnumType, EnumValues
|
|
4
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
5
|
+
from followthemoney.util import gettext, defer as _
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TopicType(EnumType):
    """A controlled vocabulary of terms that can be applied to some entities,
    such as companies and people. Topics describe categories of journalistic
    interest which may apply to an entity, for example whether a given person
    is a criminal or a politician.

    Beyond their informative value, topics are ultimately meant to pay off in
    graph-based data analysis, where they would enable queries such as _find
    all paths between a government procurement award and a politician_."""

    name = "topic"
    group = "topics"
    label = _("Topic")
    plural = _("Topics")
    matchable = False
    max_length = 64

    # Topic code -> untranslated label. Labels are translated on demand in
    # `_locale_names`.
    _TOPICS = {
        # Crime
        "crime": _("Crime"),
        "crime.fraud": _("Fraud"),
        "crime.cyber": _("Cybercrime"),
        "crime.fin": _("Financial crime"),
        "crime.env": _("Environmental violations"),
        "crime.theft": _("Theft"),
        "crime.war": _("War crimes"),
        "crime.boss": _("Criminal leadership"),
        "crime.terror": _("Terrorism"),
        "crime.traffick": _("Trafficking"),
        "crime.traffick.drug": _("Drug trafficking"),
        "crime.traffick.human": _("Human trafficking"),
        "wanted": _("Wanted"),
        # Corporate
        "corp.offshore": _("Offshore"),
        "corp.shell": _("Shell company"),
        "corp.public": _("Public listed company"),
        "corp.disqual": _("Disqualified"),
        # Government
        "gov": _("Government"),
        "gov.national": _("National government"),
        "gov.state": _("State government"),
        "gov.muni": _("Municipal government"),
        "gov.soe": _("State-owned enterprise"),
        "gov.igo": _("Intergovernmental organization"),
        "gov.head": _("Head of government or state"),
        "gov.admin": _("Civil service"),
        "gov.executive": _("Executive branch of government"),
        "gov.legislative": _("Legislative branch of government"),
        "gov.judicial": _("Judicial branch of government"),
        "gov.security": _("Security services"),
        "gov.financial": _("Central banking and financial integrity"),
        # Finance
        "fin": _("Financial services"),
        "fin.bank": _("Bank"),
        "fin.fund": _("Fund"),
        # NOTE(review): the key is spelled `fin.adivsor` (sic). It looks like a
        # typo for `fin.advisor`, but it is a runtime value and is kept
        # unchanged here; confirm against stored data before renaming.
        "fin.adivsor": _("Financial advisor"),
        # Regulators
        "reg.action": _("Regulator action"),
        "reg.warn": _("Regulator warning"),
        # Roles
        "role.pep": _("Politician"),
        "role.pol": _("Non-PEP"),
        "role.rca": _("Close Associate"),
        "role.judge": _("Judge"),
        "role.civil": _("Civil servant"),
        "role.diplo": _("Diplomat"),
        "role.lawyer": _("Lawyer"),
        "role.acct": _("Accountant"),
        "role.spy": _("Spy"),
        "role.oligarch": _("Oligarch"),
        "role.journo": _("Journalist"),
        "role.act": _("Activist"),
        "role.lobby": _("Lobbyist"),
        # Politics, religion, military
        "pol.party": _("Political party"),
        "pol.union": _("Union"),
        "rel": _("Religion"),
        "mil": _("Military"),
        # Sanctions, trade controls and other listings
        "asset.frozen": _("Frozen asset"),
        "sanction": _("Sanctioned entity"),
        "sanction.linked": _("Sanction-linked entity"),
        "sanction.counter": _("Counter-sanctioned entity"),
        "export.control": _("Export controlled"),
        "export.risk": _("Trade risk"),
        "debarment": _("Debarred entity"),
        "poi": _("Person of interest"),
    }

    def _locale_names(self, locale: Locale) -> EnumValues:
        """Return a mapping of topic code to its label passed through `gettext`.

        The `locale` argument is not read here; translation goes through the
        module-level `gettext` helper.
        """
        names: EnumValues = {}
        for code, label in self._TOPICS.items():
            names[code] = gettext(label)
        return names

    def rdf(self, value: str) -> Identifier:
        """Return a `URIRef` of the form `ftm:topic:<value>`."""
        uri = f"ftm:topic:{value}"
        return URIRef(uri)
|
followthemoney/types/url.py
CHANGED
|
@@ -1,27 +1,49 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
1
|
+
from typing import Optional, TYPE_CHECKING
|
|
2
|
+
from rigour.urls import clean_url, compare_urls
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
6
|
+
from followthemoney.util import dampen, defer as _
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from followthemoney.proxy import EntityProxy
|
|
5
10
|
|
|
6
11
|
|
|
7
12
|
class UrlType(PropertyType):
    """A uniform resource locator (URL). Values are normalised so that the
    URL is sure to use valid encoding/quoting and to carry a scheme (e.g.
    `http`, `https`, ...)."""

    SCHEMES = ("http", "https", "ftp", "mailto")
    DEFAULT_SCHEME = "http"

    name = "url"
    group = "urls"
    label = _("URL")
    plural = _("URLs")
    matchable = True
    pivot = True
    max_length = 4096

    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Normalise `text` by delegating to `rigour.urls.clean_url`.

        Intended to make sure URLs have a scheme and a host name, assuming
        HTTP when no scheme is given; the details are up to `clean_url`.
        `fuzzy`, `format` and `proxy` are not used.
        """
        cleaned = clean_url(text)
        return cleaned

    def compare(self, left: str, right: str) -> float:
        """Return the similarity score `compare_urls` gives the two URLs."""
        score = compare_urls(left, right)
        return score

    def _specificity(self, value: str) -> float:
        """Score `value` by its length via `dampen(10, 120, value)`."""
        return dampen(10, 120, value)

    def rdf(self, value: str) -> Identifier:
        """Return `value` wrapped in a `URIRef`."""
        return URIRef(value)

    def node_id(self, value: str) -> Optional[str]:
        """Return `value` prefixed with `url:`."""
        return f"url:{value}"
|
followthemoney/util.py
CHANGED
|
@@ -1,63 +1,157 @@
|
|
|
1
1
|
import os
|
|
2
|
-
|
|
3
|
-
from
|
|
2
|
+
import logging
|
|
3
|
+
from hashlib import sha1
|
|
4
4
|
from babel import Locale
|
|
5
5
|
from gettext import translation
|
|
6
|
-
from rdflib import Namespace
|
|
7
|
-
from banal import is_mapping, is_sequence
|
|
8
|
-
from banal import unique_list, ensure_list
|
|
9
6
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
7
|
+
from threading import local
|
|
8
|
+
from typing import cast, Dict, Any, List, Optional, TypeVar, Union, Sequence
|
|
9
|
+
from normality import stringify
|
|
10
|
+
from normality.cleaning import compose_nfc
|
|
11
|
+
from normality.cleaning import remove_unsafe_chars
|
|
12
|
+
from normality.encoding import DEFAULT_ENCODING
|
|
13
|
+
from banal import is_mapping, unique_list, ensure_list
|
|
14
|
+
|
|
15
|
+
# Number of bytes in one megabyte.
MEGABYTE = 1024 * 1024
# Locale used whenever no locale has been set for the current thread.
DEFAULT_LOCALE = "en"
ENTITY_ID_LEN = 200

# Generic type variables used in the helper signatures of this module.
T = TypeVar("T")
K = TypeVar("K")
V = TypeVar("V")

PathLike = Union[str, os.PathLike[str]]
# Directory holding the translation catalogues, next to this module.
i18n_path = os.path.join(os.path.dirname(__file__), "translations")
log = logging.getLogger(__name__)
# Thread-local storage for the active locale and translation catalogue.
state = local()
|
|
27
|
+
|
|
14
28
|
|
|
29
|
+
def gettext(*args: Optional[str], **kwargs: Dict[str, str]) -> str:
    """Translate a message with the current thread's translation catalogue.

    If no catalogue has been set up on the thread-local state yet, the
    default locale is installed first. All arguments are forwarded to the
    catalogue's own `gettext` method.
    """
    try:
        catalog = state.translation
    except AttributeError:
        set_model_locale(Locale.parse(DEFAULT_LOCALE))
        catalog = state.translation
    return cast(str, catalog.gettext(*args, **kwargs))
|
|
15
33
|
|
|
16
|
-
def gettext(*args, **kwargs):
|
|
17
|
-
if not hasattr(state, 'translation'):
|
|
18
|
-
set_model_locale(DEFAULT_LOCALE)
|
|
19
|
-
return state.translation.gettext(*args, **kwargs)
|
|
20
34
|
|
|
35
|
+
def defer(text: str) -> str:
    """Return `text` unchanged.

    Other modules import this as `_` to wrap label strings without
    translating them at definition time; translation happens later through
    `gettext`.
    """
    return text
|
|
21
37
|
|
|
22
|
-
|
|
38
|
+
|
|
39
|
+
def set_model_locale(locale: Locale) -> None:
    """Store `locale` and its translation catalogue on the thread-local state.

    The catalogue is loaded for the `followthemoney` domain from `i18n_path`.
    With `fallback=True`, a missing catalogue does not raise.
    """
    state.locale = locale
    languages = [str(locale)]
    state.translation = translation(
        "followthemoney", localedir=i18n_path, languages=languages, fallback=True
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def get_locale() -> Locale:
    """Return the locale of the current thread, or the default locale when
    none has been set."""
    current = getattr(state, "locale", DEFAULT_LOCALE)
    return Locale.parse(current)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_env_list(name: str, default: Optional[List[str]] = None) -> List[str]:
    """Read a colon-separated list from the environment variable `name`.

    Args:
        name: The environment variable to read.
        default: Returned as-is when the variable yields no text. When it is
            omitted, a new empty list is returned on every call.

    Returns:
        The variable's text split on ``:``, or the default.
    """
    value = stringify(os.environ.get(name))
    if value is not None:
        # str.split always returns at least one element, so no emptiness
        # check is needed on the result.
        return value.split(":")
    # A literal `[]` default would be one shared list object handed to every
    # caller; build a fresh one instead so callers may mutate the result.
    return [] if default is None else default
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def sanitize_text(text: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]:
    """Convert `text` to a cleaned string, or None if that is not possible.

    Steps: stringify the input (using `encoding` as the default encoding),
    compose it to NFC, remove unsafe characters, and finally round-trip it
    through `DEFAULT_ENCODING` with the "replace" error handler. None is
    returned when stringifying or cleaning yields None, or when NFC
    composition raises (which is logged as a warning).
    """
    decoded = stringify(text, encoding_default=encoding)
    if decoded is None:
        return None
    try:
        composed = compose_nfc(decoded)
    except Exception as ex:  # SystemError is already an Exception subclass
        log.warning("Cannot NFC text: %s", ex)
        return None
    safe = remove_unsafe_chars(composed)
    if safe is None:
        return None
    encoded = safe.encode(DEFAULT_ENCODING, "replace")
    return cast(str, encoded.decode(DEFAULT_ENCODING, "replace"))
|
|
26
75
|
|
|
27
76
|
|
|
28
|
-
def
|
|
29
|
-
if not
|
|
30
|
-
|
|
31
|
-
|
|
77
|
+
def value_list(value: Union[T, Sequence[T]]) -> List[T]:
    """Return `value` as a list.

    Strings and bytes are treated as single values, not as sequences of
    characters. Any other iterable is expanded into a list of its items; a
    value that cannot be iterated is wrapped in a one-element list.
    """
    if isinstance(value, (str, bytes)):
        return [cast(T, value)]
    try:
        return list(cast(Sequence[T], value))
    except TypeError:
        # Not iterable: treat it as a single value.
        return [cast(T, value)]
|
|
32
84
|
|
|
33
85
|
|
|
34
|
-
def key_bytes(key):
|
|
86
|
+
def key_bytes(key: Any) -> bytes:
    """Turn `key` into bytes suitable for feeding into a hash.

    Bytes are returned unchanged. Anything else is stringified and encoded
    as UTF-8; a value that stringifies to None becomes empty bytes.
    """
    if isinstance(key, bytes):
        return key
    text = stringify(key)
    return b"" if text is None else text.encode("utf-8")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def join_text(*parts: Any, sep: str = " ") -> Optional[str]:
    """Join the arguments that stringify to a non-null value using `sep`.

    Returns None when no argument produces text.
    """
    texts: List[str] = [
        text for text in (stringify(part) for part in parts) if text is not None
    ]
    if not texts:
        return None
    return sep.join(texts)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def get_entity_id(obj: Any) -> Optional[str]:
    """Try to extract an ID from an entity-like object.

    For a mapping, the value under the `"id"` key is used. For other
    objects, the `id` attribute is used if present; otherwise the object
    itself is taken as the ID. The result is stringified.
    """
    if is_mapping(obj):
        return stringify(obj.get("id"))
    return stringify(getattr(obj, "id", obj))
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def make_entity_id(*parts: Any, key_prefix: Optional[str] = None) -> Optional[str]:
    """Build a SHA-1 hex digest from `key_prefix` and all `parts`.

    Returns None when the parts contribute no bytes, i.e. the digest after
    hashing them equals the digest of the prefix alone.
    """
    hasher = sha1()
    if key_prefix:
        hasher.update(key_bytes(key_prefix))
    # Snapshot taken before any part is hashed, to detect "nothing was added".
    prefix_only = hasher.digest()
    for part in parts:
        hasher.update(key_bytes(part))
    return None if hasher.digest() == prefix_only else hasher.hexdigest()
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def merge_context(left: Dict[K, V], right: Dict[K, V]) -> Dict[K, List[V]]:
    """Combine the context of two entities that are being merged.

    For every key present on either side (except `"caption"`, which is
    dropped), the non-None values of both sides are collected into one
    de-duplicated list, left side first.
    """

    def _values(context: Dict[K, V], key: K) -> List[V]:
        # Values may be stored as scalars or lists; normalise and drop None.
        return [item for item in ensure_list(context.get(key)) if item is not None]

    combined = {}
    keys = list(left) + list(right)
    for key in set(keys):
        if key not in ("caption",):
            combined[key] = unique_list([*_values(left, key), *_values(right, key)])
    return combined
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def dampen(short: int, long: int, text: str) -> float:
    """Map the length of `text` onto a score between 0.0 and 1.0.

    Args:
        short: Length at or below which the score is 0.0.
        long: Length at or above which the score is 1.0.
        text: The text whose length is scored.

    Returns:
        A float that grows linearly from 0.0 to 1.0 as the length goes from
        `short` to `long`, clamped to that range.
    """
    length = len(text) - short
    # Clamp the span to at least 1.0 so that `long <= short` cannot cause a
    # division by zero or a negative divisor.
    baseline = max(1.0, (long - short))
    # Use a float lower bound so the result is always a float; `max(0, ...)`
    # returned the int 0 for short inputs despite the declared return type.
    return max(0.0, min(1.0, (length / baseline)))
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def shortest(*texts: str) -> str:
    """Return the argument with the smallest length.

    When several arguments share the smallest length, the first of them is
    returned. Calling this with no arguments raises `ValueError`.
    """
    candidates = texts
    return min(candidates, key=len)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def longest(*texts: str) -> str:
    """Return the argument with the greatest length.

    When several arguments share the greatest length, the first of them is
    returned. Calling this with no arguments raises `ValueError`.
    """
    candidates = texts
    return max(candidates, key=len)
|