followthemoney 1.3.7__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. followthemoney/__init__.py +5 -3
  2. followthemoney/cli/__init__.py +17 -0
  3. followthemoney/cli/aggregate.py +56 -0
  4. followthemoney/cli/cli.py +88 -0
  5. followthemoney/cli/exports.py +121 -0
  6. followthemoney/cli/mapping.py +85 -0
  7. followthemoney/cli/sieve.py +67 -0
  8. followthemoney/cli/util.py +142 -0
  9. followthemoney/compare.py +130 -60
  10. followthemoney/exc.py +19 -6
  11. followthemoney/export/common.py +29 -0
  12. followthemoney/export/csv.py +82 -0
  13. followthemoney/export/excel.py +75 -0
  14. followthemoney/export/graph.py +79 -0
  15. followthemoney/export/neo4j.py +182 -0
  16. followthemoney/export/rdf.py +26 -0
  17. followthemoney/graph.py +308 -0
  18. followthemoney/helpers.py +212 -0
  19. followthemoney/mapping/__init__.py +1 -1
  20. followthemoney/mapping/csv.py +67 -35
  21. followthemoney/mapping/entity.py +116 -44
  22. followthemoney/mapping/property.py +90 -44
  23. followthemoney/mapping/query.py +27 -19
  24. followthemoney/mapping/source.py +15 -5
  25. followthemoney/mapping/sql.py +75 -61
  26. followthemoney/messages.py +13 -7
  27. followthemoney/model.py +108 -56
  28. followthemoney/namespace.py +119 -0
  29. followthemoney/offshore.py +48 -0
  30. followthemoney/ontology.py +77 -0
  31. followthemoney/property.py +204 -71
  32. followthemoney/proxy.py +455 -118
  33. followthemoney/rdf.py +9 -0
  34. followthemoney/schema/Address.yaml +78 -0
  35. followthemoney/schema/Airplane.yaml +17 -10
  36. followthemoney/schema/Analyzable.yaml +54 -0
  37. followthemoney/schema/Article.yaml +16 -0
  38. followthemoney/schema/Assessment.yaml +32 -0
  39. followthemoney/schema/Asset.yaml +10 -4
  40. followthemoney/schema/Associate.yaml +41 -0
  41. followthemoney/schema/Audio.yaml +24 -0
  42. followthemoney/schema/BankAccount.yaml +53 -9
  43. followthemoney/schema/Call.yaml +48 -0
  44. followthemoney/schema/CallForTenders.yaml +117 -0
  45. followthemoney/schema/Company.yaml +37 -12
  46. followthemoney/schema/Contract.yaml +41 -7
  47. followthemoney/schema/ContractAward.yaml +30 -11
  48. followthemoney/schema/CourtCase.yaml +16 -10
  49. followthemoney/schema/CourtCaseParty.yaml +17 -6
  50. followthemoney/schema/CryptoWallet.yaml +48 -0
  51. followthemoney/schema/Debt.yaml +37 -0
  52. followthemoney/schema/Directorship.yaml +17 -4
  53. followthemoney/schema/Document.yaml +72 -139
  54. followthemoney/schema/Documentation.yml +38 -0
  55. followthemoney/schema/EconomicActivity.yaml +32 -17
  56. followthemoney/schema/Email.yaml +76 -0
  57. followthemoney/schema/Employment.yaml +39 -0
  58. followthemoney/schema/Event.yaml +35 -3
  59. followthemoney/schema/Family.yaml +41 -0
  60. followthemoney/schema/Folder.yaml +13 -0
  61. followthemoney/schema/HyperText.yaml +21 -0
  62. followthemoney/schema/Identification.yaml +40 -0
  63. followthemoney/schema/Image.yaml +25 -0
  64. followthemoney/schema/Interest.yaml +3 -6
  65. followthemoney/schema/Interval.yaml +56 -5
  66. followthemoney/schema/LegalEntity.yaml +81 -20
  67. followthemoney/schema/License.yaml +7 -3
  68. followthemoney/schema/Membership.yaml +19 -4
  69. followthemoney/schema/Mention.yaml +54 -0
  70. followthemoney/schema/Message.yaml +73 -0
  71. followthemoney/schema/Note.yaml +23 -0
  72. followthemoney/schema/Occupancy.yaml +40 -0
  73. followthemoney/schema/Organization.yaml +38 -3
  74. followthemoney/schema/Ownership.yaml +16 -4
  75. followthemoney/schema/Package.yaml +17 -0
  76. followthemoney/schema/Page.yaml +43 -0
  77. followthemoney/schema/Pages.yaml +23 -0
  78. followthemoney/schema/Passport.yaml +15 -17
  79. followthemoney/schema/Payment.yaml +38 -7
  80. followthemoney/schema/Person.yaml +61 -5
  81. followthemoney/schema/PlainText.yaml +17 -0
  82. followthemoney/schema/Position.yaml +50 -0
  83. followthemoney/schema/Post.yaml +42 -0
  84. followthemoney/schema/Project.yaml +27 -0
  85. followthemoney/schema/ProjectParticipant.yaml +36 -0
  86. followthemoney/schema/PublicBody.yaml +14 -3
  87. followthemoney/schema/RealEstate.yaml +19 -3
  88. followthemoney/schema/Representation.yaml +17 -6
  89. followthemoney/schema/Sanction.yaml +44 -20
  90. followthemoney/schema/Security.yaml +59 -0
  91. followthemoney/schema/Similar.yaml +37 -0
  92. followthemoney/schema/Succession.yaml +36 -0
  93. followthemoney/schema/Table.yaml +32 -0
  94. followthemoney/schema/TaxRoll.yaml +27 -9
  95. followthemoney/schema/Thing.yaml +69 -13
  96. followthemoney/schema/Trip.yaml +42 -0
  97. followthemoney/schema/UnknownLink.yaml +17 -6
  98. followthemoney/schema/UserAccount.yaml +44 -0
  99. followthemoney/schema/Value.yaml +5 -1
  100. followthemoney/schema/Vehicle.yaml +25 -8
  101. followthemoney/schema/Vessel.yaml +18 -10
  102. followthemoney/schema/Video.yaml +20 -0
  103. followthemoney/schema/Workbook.yaml +18 -0
  104. followthemoney/schema.py +406 -135
  105. followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
  106. followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
  107. followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
  108. followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
  109. followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
  110. followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
  111. followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
  112. followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
  113. followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
  114. followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
  115. followthemoney/translations/fr/followthemoney.po +3861 -0
  116. followthemoney/translations/messages.pot +3021 -725
  117. followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
  118. followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
  119. followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
  120. followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
  121. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
  122. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
  123. followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
  124. followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
  125. followthemoney/translations/ru/followthemoney.po +4221 -0
  126. followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
  127. followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
  128. followthemoney/types/__init__.py +35 -17
  129. followthemoney/types/address.py +41 -21
  130. followthemoney/types/checksum.py +25 -0
  131. followthemoney/types/common.py +233 -88
  132. followthemoney/types/country.py +89 -56
  133. followthemoney/types/date.py +59 -76
  134. followthemoney/types/email.py +66 -35
  135. followthemoney/types/entity.py +66 -13
  136. followthemoney/types/gender.py +66 -0
  137. followthemoney/types/iban.py +47 -28
  138. followthemoney/types/identifier.py +49 -22
  139. followthemoney/types/ip.py +35 -21
  140. followthemoney/types/json.py +58 -0
  141. followthemoney/types/language.py +124 -37
  142. followthemoney/types/mimetype.py +44 -0
  143. followthemoney/types/name.py +56 -12
  144. followthemoney/types/number.py +30 -0
  145. followthemoney/types/phone.py +92 -34
  146. followthemoney/types/registry.py +52 -0
  147. followthemoney/types/string.py +43 -0
  148. followthemoney/types/topic.py +94 -0
  149. followthemoney/types/url.py +39 -17
  150. followthemoney/util.py +139 -45
  151. followthemoney-3.8.0.dist-info/METADATA +153 -0
  152. followthemoney-3.8.0.dist-info/RECORD +157 -0
  153. {followthemoney-1.3.7.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
  154. followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
  155. followthemoney-1.3.7.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
  156. followthemoney/link.py +0 -75
  157. followthemoney/schema/Associate.yml +0 -19
  158. followthemoney/schema/Family.yml +0 -19
  159. followthemoney/schema/Land.yml +0 -9
  160. followthemoney/schema/Relationship.yaml +0 -26
  161. followthemoney/types/domain.py +0 -50
  162. followthemoney-1.3.7.dist-info/DESCRIPTION.rst +0 -3
  163. followthemoney-1.3.7.dist-info/METADATA +0 -39
  164. followthemoney-1.3.7.dist-info/RECORD +0 -108
  165. followthemoney-1.3.7.dist-info/entry_points.txt +0 -3
  166. followthemoney-1.3.7.dist-info/metadata.json +0 -1
  167. followthemoney-1.3.7.dist-info/namespace_packages.txt +0 -1
  168. followthemoney-1.3.7.dist-info/top_level.txt +0 -3
  169. ns/ontology.py +0 -128
  170. tests/types/test_addresses.py +0 -24
  171. tests/types/test_common.py +0 -32
  172. tests/types/test_countries.py +0 -27
  173. tests/types/test_dates.py +0 -73
  174. tests/types/test_domains.py +0 -23
  175. tests/types/test_emails.py +0 -32
  176. tests/types/test_entity.py +0 -19
  177. tests/types/test_iban.py +0 -109
  178. tests/types/test_identifiers.py +0 -27
  179. tests/types/test_ip.py +0 -29
  180. tests/types/test_languages.py +0 -23
  181. tests/types/test_names.py +0 -33
  182. tests/types/test_phones.py +0 -24
  183. tests/types/test_registry.py +0 -14
  184. tests/types/test_urls.py +0 -23
  185. {ns → followthemoney/export}/__init__.py +0 -0
  186. /tests/types/__init__.py → /followthemoney/py.typed +0 -0
@@ -1,31 +1,54 @@
1
- from rdflib import URIRef
2
- from banal import ensure_list
3
- from phonenumbers import geocoder
1
+ from typing import Iterable, Optional, TYPE_CHECKING
4
2
  from phonenumbers import parse as parse_number
5
- from phonenumbers import is_possible_number, is_valid_number, format_number
6
- from phonenumbers import PhoneNumberFormat
7
- from phonenumbers.phonenumberutil import NumberParseException
3
+ from phonenumbers import is_valid_number, format_number
4
+ from phonenumbers import PhoneNumber, PhoneNumberFormat
5
+ from phonenumbers.phonenumberutil import region_code_for_number, NumberParseException
8
6
 
9
7
  from followthemoney.types.common import PropertyType
8
+ from followthemoney.rdf import URIRef, Identifier
9
+ from followthemoney.util import defer as _
10
+ from followthemoney.util import dampen
11
+
12
+ if TYPE_CHECKING:
13
+ from followthemoney.proxy import EntityProxy
14
+
15
+
16
+ # TODO: for json schema export
17
+ # https://stackoverflow.com/questions/6478875/regular-expression-matching-e-164-formatted-phone-numbers
10
18
 
11
19
 
12
20
  class PhoneType(PropertyType):
13
- name = 'phone'
14
- group = 'phones'
15
- prefix = 'tel'
16
- strong = False
17
-
18
- def _clean_countries(self, countries, country):
19
- result = set([None])
20
- countries = ensure_list(countries)
21
- countries.extend(ensure_list(country))
22
- for country in countries:
23
- if isinstance(country, str):
24
- country = country.strip().upper()
25
- result.add(country)
26
- return result
27
-
28
- def clean_text(self, number, countries=None, country=None, **kwargs):
21
+ """A phone number in E.164 format. This means that phone numbers always
22
+ include an international country prefix (e.g. `+38760183628`). The
23
+ cleaning and validation functions for this try to be smart by
24
+ accepting a list of countries as an argument in order to add the number
25
+ prefix.
26
+
27
+ When adding a property of this type to an entity, any country-type properties
28
+ defined for the entity are considered for validation. That means that adding a
29
+ phone number to an entity before adding a country can have a different
30
+ validation outcome from doing the two operations the other way around. Always
31
+ define the country first."""
32
+
33
+ name = "phone"
34
+ group = "phones"
35
+ label = _("Phone number")
36
+ plural = _("Phone numbers")
37
+ matchable = True
38
+ pivot = True
39
+ max_length = 64
40
+
41
+ def _clean_countries(
42
+ self, proxy: Optional["EntityProxy"]
43
+ ) -> Iterable[Optional[str]]:
44
+ yield None
45
+ if proxy is not None:
46
+ for country in proxy.countries:
47
+ yield country.upper()
48
+
49
+ def _parse_number(
50
+ self, number: str, proxy: Optional["EntityProxy"] = None
51
+ ) -> Iterable[PhoneNumber]:
29
52
  """Parse a phone number and return in international format.
30
53
 
31
54
  If no valid phone number can be detected, None is returned. If
@@ -34,24 +57,59 @@ class PhoneType(PropertyType):
34
57
 
35
58
  https://github.com/daviddrysdale/python-phonenumbers
36
59
  """
37
- for code in self._clean_countries(countries, country):
60
+ for code in self._clean_countries(proxy):
38
61
  try:
39
- num = parse_number(number, code)
40
- if is_possible_number(num):
41
- if is_valid_number(num):
42
- return format_number(num, PhoneNumberFormat.E164)
62
+ yield parse_number(number, code)
43
63
  except NumberParseException:
44
64
  pass
45
65
 
46
- def specificity(self, value):
47
- return 1
66
+ def validate(
67
+ self, value: str, fuzzy: bool = False, format: Optional[str] = None
68
+ ) -> bool:
69
+ for num in self._parse_number(value):
70
+ if is_valid_number(num):
71
+ return True
72
+ return False
73
+
74
+ def clean_text(
75
+ self,
76
+ text: str,
77
+ fuzzy: bool = False,
78
+ format: Optional[str] = None,
79
+ proxy: Optional["EntityProxy"] = None,
80
+ ) -> Optional[str]:
81
+ for num in self._parse_number(text, proxy=proxy):
82
+ if is_valid_number(num):
83
+ return str(format_number(num, PhoneNumberFormat.E164))
84
+ return None
48
85
 
49
- def country_hint(self, value):
86
+ def country_hint(self, value: str) -> Optional[str]:
50
87
  try:
51
88
  number = parse_number(value)
52
- return geocoder.region_code_for_number(number).lower()
89
+ code = region_code_for_number(number)
90
+ if code is None:
91
+ return None
92
+ return str(code).lower()
53
93
  except NumberParseException:
54
- pass
94
+ return None
95
+
96
+ def _specificity(self, value: str) -> float:
97
+ # TODO: insert artificial intelligence here.
98
+ return dampen(7, 11, value)
99
+
100
+ def rdf(self, value: str) -> Identifier:
101
+ node_id = self.node_id(value)
102
+ if node_id is not None:
103
+ return URIRef(node_id)
104
+ raise ValueError("Invalid phone number for serialisation: %s" % value)
105
+
106
+ def node_id(self, value: str) -> Optional[str]:
107
+ return f"tel:{value}"
55
108
 
56
- def rdf(self, value):
57
- return URIRef('tel:%s' % value)
109
+ def caption(self, value: str) -> str:
110
+ try:
111
+ number = parse_number(value)
112
+ formatted = format_number(number, PhoneNumberFormat.INTERNATIONAL)
113
+ return str(formatted)
114
+ except NumberParseException:
115
+ return value
@@ -0,0 +1,52 @@
1
+ from banal import ensure_list
2
+ from typing import Iterable, Set, Dict, Type, Union, List, Optional
3
+
4
+ from followthemoney.types.common import PropertyType
5
+
6
+
7
+ class Registry(object):
8
+ """This registry keeps the processing helpers for all property types
9
+ in the system. They are instantiated as singletons when the system is first
10
+ loaded. The registry can be used to get a type, which can itself then
11
+ clean, validate or format values of that type."""
12
+
13
+ def __init__(self) -> None:
14
+ self.named: Dict[str, PropertyType] = {}
15
+ self.matchable: Set[PropertyType] = set()
16
+ self.types: Set[PropertyType] = set()
17
+ self.groups: Dict[str, PropertyType] = {}
18
+ self.pivots: Set[PropertyType] = set()
19
+
20
+ def add(self, clazz: Type[PropertyType]) -> None:
21
+ """Add a singleton class."""
22
+ type_ = clazz()
23
+ self.named[clazz.name] = type_
24
+ self.types.add(type_)
25
+ if type_.matchable:
26
+ self.matchable.add(type_)
27
+ if type_.pivot:
28
+ self.pivots.add(type_)
29
+ if type_.group is not None:
30
+ self.groups[type_.group] = type_
31
+
32
+ def get(self, name: Union[str, PropertyType]) -> Optional[PropertyType]:
33
+ """For a given property type name, get its type object. This can also
34
+ be used via getattr, e.g. ``registry.phone``."""
35
+ # Allow transparent re-checking.
36
+ if isinstance(name, PropertyType):
37
+ return name
38
+ return self.named.get(name)
39
+
40
+ def get_types(
41
+ self, names: Iterable[Union[str, PropertyType]]
42
+ ) -> List[PropertyType]:
43
+ """Get the type objects for the given names, skipping unknown names."""
44
+ names = ensure_list(names)
45
+ types = [self.get(n) for n in names]
46
+ return [t for t in types if t is not None]
47
+
48
+ def __getitem__(self, name: str) -> PropertyType:
49
+ return self.named[name]
50
+
51
+ def __getattr__(self, name: str) -> PropertyType:
52
+ return self.named[name]
@@ -0,0 +1,43 @@
1
+ from followthemoney.types.common import PropertyType
2
+ from followthemoney.util import defer as _
3
+ from followthemoney.util import MEGABYTE
4
+
5
+
6
+ class StringType(PropertyType):
7
+ """A simple string property with no additional semantics."""
8
+
9
+ name = "string"
10
+ label = _("Label")
11
+ plural = _("Labels")
12
+ matchable = False
13
+ max_length = 1024
14
+
15
+ def node_id(self, value: str) -> None:
16
+ return None
17
+
18
+
19
+ class TextType(StringType):
20
+ """Longer text fragments, such as descriptions or document text. Unlike
21
+ string properties, it might make sense to treat properties of this type as
22
+ full-text search material."""
23
+
24
+ name = "text"
25
+ label = _("Text")
26
+ plural = _("Texts")
27
+ total_size = 30 * MEGABYTE
28
+ max_length = 65000
29
+
30
+
31
+ class HTMLType(StringType):
32
+ """Properties that contain raw hypertext markup (HTML).
33
+
34
+ User interfaces rendering properties of this type need to take extreme
35
+ care not to allow attacks such as cross-site scripting. It is recommended
36
+ to perform server-side sanitisation, or to not render this property at all.
37
+ """
38
+
39
+ name = "html"
40
+ label = _("HTML")
41
+ plural = _("HTMLs")
42
+ total_size = 30 * MEGABYTE
43
+ max_length = 65000
@@ -0,0 +1,94 @@
1
+ from babel.core import Locale
2
+
3
+ from followthemoney.types.common import EnumType, EnumValues
4
+ from followthemoney.rdf import URIRef, Identifier
5
+ from followthemoney.util import gettext, defer as _
6
+
7
+
8
+ class TopicType(EnumType):
9
+ """Topics define a controlled vocabulary of terms applicable to some
10
+ entities, such as companies and people. They describe categories of
11
+ journalistic interest which may apply to the given entity, for example
12
+ if a given person is a criminal or a politician.
13
+
14
+ Besides the informative value, topics are ultimately supposed to bear
15
+ fruits in the context of graph-based data analysis, where they would
16
+ enable queries such as _find all paths between a government procurement
17
+ award and a politician_."""
18
+
19
+ name = "topic"
20
+ group = "topics"
21
+ label = _("Topic")
22
+ plural = _("Topics")
23
+ matchable = False
24
+ max_length = 64
25
+
26
+ _TOPICS = {
27
+ "crime": _("Crime"),
28
+ "crime.fraud": _("Fraud"),
29
+ "crime.cyber": _("Cybercrime"),
30
+ "crime.fin": _("Financial crime"),
31
+ "crime.env": _("Environmental violations"),
32
+ "crime.theft": _("Theft"),
33
+ "crime.war": _("War crimes"),
34
+ "crime.boss": _("Criminal leadership"),
35
+ "crime.terror": _("Terrorism"),
36
+ "crime.traffick": _("Trafficking"),
37
+ "crime.traffick.drug": _("Drug trafficking"),
38
+ "crime.traffick.human": _("Human trafficking"),
39
+ "wanted": _("Wanted"),
40
+ "corp.offshore": _("Offshore"),
41
+ "corp.shell": _("Shell company"),
42
+ "corp.public": _("Public listed company"),
43
+ "corp.disqual": _("Disqualified"),
44
+ "gov": _("Government"),
45
+ "gov.national": _("National government"),
46
+ "gov.state": _("State government"),
47
+ "gov.muni": _("Municipal government"),
48
+ "gov.soe": _("State-owned enterprise"),
49
+ "gov.igo": _("Intergovernmental organization"),
50
+ "gov.head": _("Head of government or state"),
51
+ "gov.admin": _("Civil service"),
52
+ "gov.executive": _("Executive branch of government"),
53
+ "gov.legislative": _("Legislative branch of government"),
54
+ "gov.judicial": _("Judicial branch of government"),
55
+ "gov.security": _("Security services"),
56
+ "gov.financial": _("Central banking and financial integrity"),
57
+ "fin": _("Financial services"),
58
+ "fin.bank": _("Bank"),
59
+ "fin.fund": _("Fund"),
60
+ "fin.adivsor": _("Financial advisor"),
61
+ "reg.action": _("Regulator action"),
62
+ "reg.warn": _("Regulator warning"),
63
+ "role.pep": _("Politician"),
64
+ "role.pol": _("Non-PEP"),
65
+ "role.rca": _("Close Associate"),
66
+ "role.judge": _("Judge"),
67
+ "role.civil": _("Civil servant"),
68
+ "role.diplo": _("Diplomat"),
69
+ "role.lawyer": _("Lawyer"),
70
+ "role.acct": _("Accountant"),
71
+ "role.spy": _("Spy"),
72
+ "role.oligarch": _("Oligarch"),
73
+ "role.journo": _("Journalist"),
74
+ "role.act": _("Activist"),
75
+ "role.lobby": _("Lobbyist"),
76
+ "pol.party": _("Political party"),
77
+ "pol.union": _("Union"),
78
+ "rel": _("Religion"),
79
+ "mil": _("Military"),
80
+ "asset.frozen": _("Frozen asset"),
81
+ "sanction": _("Sanctioned entity"),
82
+ "sanction.linked": _("Sanction-linked entity"),
83
+ "sanction.counter": _("Counter-sanctioned entity"),
84
+ "export.control": _("Export controlled"),
85
+ "export.risk": _("Trade risk"),
86
+ "debarment": _("Debarred entity"),
87
+ "poi": _("Person of interest"),
88
+ }
89
+
90
+ def _locale_names(self, locale: Locale) -> EnumValues:
91
+ return {k: gettext(v) for (k, v) in self._TOPICS.items()}
92
+
93
+ def rdf(self, value: str) -> Identifier:
94
+ return URIRef(f"ftm:topic:{value}")
@@ -1,27 +1,49 @@
1
- from rdflib import URIRef
2
- from urlnormalizer import normalize_url, is_valid_url
1
+ from typing import Optional, TYPE_CHECKING
2
+ from rigour.urls import clean_url, compare_urls
3
3
 
4
4
  from followthemoney.types.common import PropertyType
5
+ from followthemoney.rdf import URIRef, Identifier
6
+ from followthemoney.util import dampen, defer as _
7
+
8
+ if TYPE_CHECKING:
9
+ from followthemoney.proxy import EntityProxy
5
10
 
6
11
 
7
12
  class UrlType(PropertyType):
8
- name = 'url'
9
- group = 'urls'
10
- prefix = 'url'
13
+ """A uniform resource locator (URL). This will perform some normalisation
14
+ on the URL so that it's sure to be using valid encoding/quoting, and to
15
+ make sure the URL has a scheme (e.g. `http`, `https`, ...)."""
16
+
17
+ SCHEMES = ("http", "https", "ftp", "mailto")
18
+ DEFAULT_SCHEME = "http"
11
19
 
12
- def validate(self, url, **kwargs):
13
- """Check if `url` is a valid URL."""
14
- return is_valid_url(url)
20
+ name = "url"
21
+ group = "urls"
22
+ label = _("URL")
23
+ plural = _("URLs")
24
+ matchable = True
25
+ pivot = True
26
+ max_length = 4096
15
27
 
16
- def clean_text(self, url, **kwargs):
17
- """Perform intensive care on URLs, see `urlnormalizer`."""
18
- try:
19
- return normalize_url(url)
20
- except UnicodeDecodeError:
21
- return None
28
+ def clean_text(
29
+ self,
30
+ text: str,
31
+ fuzzy: bool = False,
32
+ format: Optional[str] = None,
33
+ proxy: Optional["EntityProxy"] = None,
34
+ ) -> Optional[str]:
35
+ """Perform intensive care on URLs to make sure they have a scheme
36
+ and a host name. If no scheme is given HTTP is assumed."""
37
+ return clean_url(text)
22
38
 
23
- def specificity(self, value):
24
- return 1
39
+ def compare(self, left: str, right: str) -> float:
40
+ return compare_urls(left, right)
25
41
 
26
- def rdf(self, value):
42
+ def _specificity(self, value: str) -> float:
43
+ return dampen(10, 120, value)
44
+
45
+ def rdf(self, value: str) -> Identifier:
27
46
  return URIRef(value)
47
+
48
+ def node_id(self, value: str) -> Optional[str]:
49
+ return f"url:{value}"
followthemoney/util.py CHANGED
@@ -1,63 +1,157 @@
1
1
  import os
2
- from threading import local
3
- from normality import stringify
2
+ import logging
3
+ from hashlib import sha1
4
4
  from babel import Locale
5
5
  from gettext import translation
6
- from rdflib import Namespace
7
- from banal import is_mapping, is_sequence
8
- from banal import unique_list, ensure_list
9
6
 
10
- NAMESPACE = Namespace('https://w3id.org/ftm#')
11
- DEFAULT_LOCALE = 'en'
12
- i18n_path = os.path.join(os.path.dirname(__file__), 'translations')
7
+ from threading import local
8
+ from typing import cast, Dict, Any, List, Optional, TypeVar, Union, Sequence
9
+ from normality import stringify
10
+ from normality.cleaning import compose_nfc
11
+ from normality.cleaning import remove_unsafe_chars
12
+ from normality.encoding import DEFAULT_ENCODING
13
+ from banal import is_mapping, unique_list, ensure_list
14
+
15
+ MEGABYTE = 1024 * 1024
16
+ DEFAULT_LOCALE = "en"
17
+ ENTITY_ID_LEN = 200
18
+
19
+ T = TypeVar("T")
20
+ K = TypeVar("K")
21
+ V = TypeVar("V")
22
+
23
+ PathLike = Union[str, os.PathLike[str]]
24
+ i18n_path = os.path.join(os.path.dirname(__file__), "translations")
13
25
  state = local()
26
+ log = logging.getLogger(__name__)
27
+
14
28
 
29
+ def gettext(*args: Optional[str], **kwargs: Dict[str, str]) -> str:
30
+ if not hasattr(state, "translation"):
31
+ set_model_locale(Locale.parse(DEFAULT_LOCALE))
32
+ return cast(str, state.translation.gettext(*args, **kwargs))
15
33
 
16
- def gettext(*args, **kwargs):
17
- if not hasattr(state, 'translation'):
18
- set_model_locale(DEFAULT_LOCALE)
19
- return state.translation.gettext(*args, **kwargs)
20
34
 
35
+ def defer(text: str) -> str:
36
+ return text
21
37
 
22
- def set_model_locale(locale):
38
+
39
+ def set_model_locale(locale: Locale) -> None:
23
40
  state.locale = locale
24
- state.translation = translation('followthemoney', i18n_path, [locale],
25
- fallback=True)
41
+ state.translation = translation(
42
+ "followthemoney", i18n_path, [str(locale)], fallback=True
43
+ )
44
+
45
+
46
+ def get_locale() -> Locale:
47
+ if not hasattr(state, "locale"):
48
+ return Locale.parse(DEFAULT_LOCALE)
49
+ return Locale.parse(state.locale)
50
+
51
+
52
+ def get_env_list(name: str, default: List[str] = []) -> List[str]:
53
+ value = stringify(os.environ.get(name))
54
+ if value is not None:
55
+ values = value.split(":")
56
+ if len(values):
57
+ return values
58
+ return default
59
+
60
+
61
+ def sanitize_text(text: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]:
62
+ text = stringify(text, encoding_default=encoding)
63
+ if text is None:
64
+ return None
65
+ try:
66
+ text = compose_nfc(text)
67
+ except (SystemError, Exception) as ex:
68
+ log.warning("Cannot NFC text: %s", ex)
69
+ return None
70
+ text = remove_unsafe_chars(text)
71
+ if text is None:
72
+ return None
73
+ byte_text = text.encode(DEFAULT_ENCODING, "replace")
74
+ return cast(str, byte_text.decode(DEFAULT_ENCODING, "replace"))
26
75
 
27
76
 
28
- def get_locale():
29
- if not hasattr(state, 'locale'):
30
- return Locale(DEFAULT_LOCALE)
31
- return Locale(state.locale)
77
+ def value_list(value: Union[T, Sequence[T]]) -> List[T]:
78
+ if not isinstance(value, (str, bytes)):
79
+ try:
80
+ return [v for v in cast(Sequence[T], value)]
81
+ except TypeError:
82
+ pass
83
+ return [cast(T, value)]
32
84
 
33
85
 
34
- def key_bytes(key):
86
+ def key_bytes(key: Any) -> bytes:
35
87
  """Convert the given data to a value appropriate for hashing."""
36
- key = stringify(key) or ''
37
- return key.encode('utf-8')
38
-
39
-
40
- def merge_data(old, new):
41
- """Extend the values of the new doc with extra values from the old."""
42
- if is_sequence(old) or is_sequence(new):
43
- new = ensure_list(new)
44
- new.extend(ensure_list(old))
45
- return unique_list(new)
46
- if is_mapping(old) or is_mapping(new):
47
- old = old if is_mapping(old) else {}
48
- new = new if is_mapping(new) else {}
49
- keys = set(new.keys())
50
- keys.update(old.keys())
51
- combined = {}
52
- for key in keys:
53
- value = merge_data(old.get(key), new.get(key))
54
- if value is not None:
55
- combined[key] = value
56
- return combined
57
- return new or old
58
-
59
-
60
- def dampen(short, long, text):
88
+ if isinstance(key, bytes):
89
+ return key
90
+ text = stringify(key)
91
+ if text is None:
92
+ return b""
93
+ return text.encode("utf-8")
94
+
95
+
96
+ def join_text(*parts: Any, sep: str = " ") -> Optional[str]:
97
+ """Join all the non-null arguments using sep."""
98
+ texts: List[str] = []
99
+ for part in parts:
100
+ text = stringify(part)
101
+ if text is not None:
102
+ texts.append(text)
103
+ if not len(texts):
104
+ return None
105
+ return sep.join(texts)
106
+
107
+
108
+ def get_entity_id(obj: Any) -> Optional[str]:
109
+ """Given an entity-ish object, try to get the ID."""
110
+ if is_mapping(obj):
111
+ obj = obj.get("id")
112
+ else:
113
+ try:
114
+ obj = obj.id
115
+ except AttributeError:
116
+ pass
117
+ return stringify(obj)
118
+
119
+
120
+ def make_entity_id(*parts: Any, key_prefix: Optional[str] = None) -> Optional[str]:
121
+ digest = sha1()
122
+ if key_prefix:
123
+ digest.update(key_bytes(key_prefix))
124
+ base = digest.digest()
125
+ for part in parts:
126
+ digest.update(key_bytes(part))
127
+ if digest.digest() == base:
128
+ return None
129
+ return digest.hexdigest()
130
+
131
+
132
+ def merge_context(left: Dict[K, V], right: Dict[K, V]) -> Dict[K, List[V]]:
133
+ """When merging two entities, make lists of all the duplicate context
134
+ keys."""
135
+ combined = {}
136
+ keys = [*left.keys(), *right.keys()]
137
+ for key in set(keys):
138
+ if key in ("caption",):
139
+ continue
140
+ lval: List[V] = [i for i in ensure_list(left.get(key)) if i is not None]
141
+ rval: List[V] = [i for i in ensure_list(right.get(key)) if i is not None]
142
+ combined[key] = unique_list([*lval, *rval])
143
+ return combined
144
+
145
+
146
+ def dampen(short: int, long: int, text: str) -> float:
61
147
  length = len(text) - short
62
148
  baseline = max(1.0, (long - short))
63
149
  return max(0, min(1.0, (length / baseline)))
150
+
151
+
152
+ def shortest(*texts: str) -> str:
153
+ return min(texts, key=len)
154
+
155
+
156
+ def longest(*texts: str) -> str:
157
+ return max(texts, key=len)