@pencil-agent/nano-pencil 2.0.0-beta.8 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241) hide show
  1. package/README.md +267 -267
  2. package/dist/build-meta.json +3 -3
  3. package/dist/core/export-html/AGENT.md +11 -11
  4. package/dist/core/export-html/template.css +971 -971
  5. package/dist/core/export-html/template.html +54 -54
  6. package/dist/core/extensions-host/index.d.ts +1 -1
  7. package/dist/core/extensions-host/loader.js +1 -1
  8. package/dist/core/extensions-host/runner.d.ts +1 -0
  9. package/dist/core/extensions-host/runner.js +2 -2
  10. package/dist/core/extensions-host/types.d.ts +17 -22
  11. package/dist/core/lib/ai/src/types.d.ts +12 -2
  12. package/dist/core/persona/persona-manager.js +5 -2
  13. package/dist/core/runtime/agent-session.js +3 -3
  14. package/dist/core/runtime/extension-core-bindings.d.ts +1 -0
  15. package/dist/core/runtime/extension-core-bindings.js +2 -2
  16. package/dist/extensions/builtin/AGENT.md +115 -115
  17. package/dist/extensions/builtin/browser/AGENT.md +17 -17
  18. package/dist/extensions/builtin/browser/agent-workspace/agent_helpers.py +12 -12
  19. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/amazon/product-search.md +198 -198
  20. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/archive-org/scraping.md +341 -341
  21. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv/scraping.md +311 -311
  22. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv-bulk/scraping.md +333 -333
  23. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/atlas/overview.md +70 -70
  24. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/booking-com/scraping.md +578 -578
  25. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/capterra/scraping.md +440 -440
  26. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/centilebrain/generate-estimates.md +110 -110
  27. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coingecko/scraping.md +325 -325
  28. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coinmarketcap/scraping.md +463 -463
  29. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coursera/scraping.md +360 -360
  30. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/craigslist/scraping.md +390 -390
  31. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/crossref/scraping.md +568 -568
  32. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/dev-to/scraping.md +323 -323
  33. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/duckduckgo/scraping.md +349 -349
  34. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/ebay/scraping.md +435 -435
  35. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/etsy/scraping.md +506 -506
  36. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/eventbrite/scraping.md +363 -363
  37. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/expedia/automation.md +168 -168
  38. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/groups.md +236 -236
  39. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/pages.md +295 -295
  40. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/framer/editor.md +108 -108
  41. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/fred/scraping.md +493 -493
  42. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/g2/scraping.md +580 -580
  43. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/genius/scraping.md +511 -511
  44. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/repo-actions.md +65 -65
  45. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/scraping.md +184 -184
  46. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/glassdoor/scraping.md +543 -543
  47. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gmail/compose.md +122 -122
  48. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/goodreads/scraping.md +461 -461
  49. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gutenberg/scraping.md +383 -383
  50. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/hackernews/scraping.md +243 -243
  51. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/howlongtobeat/scraping.md +473 -473
  52. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/imdb/scraping.md +271 -271
  53. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/itch-io/scraping.md +436 -436
  54. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/job-boards/indeed-glassdoor.md +1021 -1021
  55. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/letterboxd/scraping.md +349 -349
  56. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/linkedin/invitation-manager.md +109 -109
  57. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/loom/folder-enumeration.md +170 -170
  58. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/macrotrends/scraping.md +537 -537
  59. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/article-hydration.md +120 -120
  60. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/scraping.md +414 -414
  61. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/metacritic/scraping.md +477 -477
  62. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/musicbrainz/scraping.md +478 -478
  63. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/nasa/scraping.md +339 -339
  64. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/news-aggregation/multi-source.md +205 -205
  65. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/open-library/scraping.md +472 -472
  66. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openalex/scraping.md +470 -470
  67. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openstreetmap/scraping.md +490 -490
  68. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/package-registries/npm-pypi.md +478 -478
  69. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/polymarket/scraping.md +234 -234
  70. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/producthunt/scraping.md +307 -307
  71. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/pubmed/scraping.md +421 -421
  72. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/quora/scraping.md +364 -364
  73. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rawg/scraping.md +352 -352
  74. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/reddit/scraping.md +124 -124
  75. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rest-countries/scraping.md +233 -233
  76. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/sec-edgar/scraping.md +361 -361
  77. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/README.md +36 -36
  78. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/embedded-apps.md +72 -72
  79. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/knowledge-base.md +109 -109
  80. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/polaris-inputs.md +137 -137
  81. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/soundcloud/scraping.md +362 -362
  82. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/spotify/scraping.md +339 -339
  83. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/stackoverflow/scraping.md +435 -435
  84. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/steam/scraping.md +575 -575
  85. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/substack/scraping.md +338 -338
  86. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/thetechgeeks/pricing.md +52 -52
  87. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tiktok/upload.md +107 -107
  88. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tradingview/scraping.md +309 -309
  89. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trello/boards-and-lists.md +88 -88
  90. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trustpilot/scraping.md +375 -375
  91. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/walmart/scraping.md +444 -444
  92. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wayback-machine/scraping.md +306 -306
  93. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/weather/scraping.md +398 -398
  94. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wellfound/scraping.md +596 -596
  95. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/world-bank/scraping.md +356 -356
  96. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/xiaohongshu/scraping.md +84 -84
  97. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/youtube/scraping.md +418 -418
  98. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/zillow/scraping.md +433 -433
  99. package/dist/extensions/builtin/browser/browser.md +73 -73
  100. package/dist/extensions/builtin/browser/install.md +142 -142
  101. package/dist/extensions/builtin/browser/interaction-skills/connection.md +48 -48
  102. package/dist/extensions/builtin/browser/interaction-skills/cookies.md +3 -3
  103. package/dist/extensions/builtin/browser/interaction-skills/cross-origin-iframes.md +3 -3
  104. package/dist/extensions/builtin/browser/interaction-skills/dialogs.md +64 -64
  105. package/dist/extensions/builtin/browser/interaction-skills/downloads.md +3 -3
  106. package/dist/extensions/builtin/browser/interaction-skills/drag-and-drop.md +3 -3
  107. package/dist/extensions/builtin/browser/interaction-skills/dropdowns.md +3 -3
  108. package/dist/extensions/builtin/browser/interaction-skills/iframes.md +3 -3
  109. package/dist/extensions/builtin/browser/interaction-skills/network-requests.md +3 -3
  110. package/dist/extensions/builtin/browser/interaction-skills/print-as-pdf.md +3 -3
  111. package/dist/extensions/builtin/browser/interaction-skills/profile-sync.md +90 -90
  112. package/dist/extensions/builtin/browser/interaction-skills/screenshots.md +17 -17
  113. package/dist/extensions/builtin/browser/interaction-skills/scrolling.md +3 -3
  114. package/dist/extensions/builtin/browser/interaction-skills/shadow-dom.md +3 -3
  115. package/dist/extensions/builtin/browser/interaction-skills/tabs.md +69 -69
  116. package/dist/extensions/builtin/browser/interaction-skills/uploads.md +1 -1
  117. package/dist/extensions/builtin/browser/interaction-skills/viewport.md +3 -3
  118. package/dist/extensions/builtin/browser/src/browser_harness/AGENT.md +15 -15
  119. package/dist/extensions/builtin/browser/src/browser_harness/__init__.py +8 -8
  120. package/dist/extensions/builtin/browser/src/browser_harness/_ipc.py +90 -90
  121. package/dist/extensions/builtin/browser/src/browser_harness/admin.py +722 -722
  122. package/dist/extensions/builtin/browser/src/browser_harness/daemon.py +328 -328
  123. package/dist/extensions/builtin/browser/src/browser_harness/helpers.py +396 -396
  124. package/dist/extensions/builtin/browser/src/browser_harness/run.py +103 -103
  125. package/dist/extensions/builtin/discipline/skills/brainstorming/SKILL.md +33 -33
  126. package/dist/extensions/builtin/discipline/skills/executing-plans/SKILL.md +25 -25
  127. package/dist/extensions/builtin/discipline/skills/finishing-development-branch/SKILL.md +25 -25
  128. package/dist/extensions/builtin/discipline/skills/receiving-code-review/SKILL.md +22 -22
  129. package/dist/extensions/builtin/discipline/skills/requesting-code-review/SKILL.md +31 -31
  130. package/dist/extensions/builtin/discipline/skills/systematic-debugging/SKILL.md +28 -28
  131. package/dist/extensions/builtin/discipline/skills/test-driven-development/SKILL.md +32 -32
  132. package/dist/extensions/builtin/discipline/skills/using-git-worktrees/SKILL.md +25 -25
  133. package/dist/extensions/builtin/discipline/skills/verification-before-completion/SKILL.md +27 -27
  134. package/dist/extensions/builtin/discipline/skills/writing-plans/SKILL.md +26 -26
  135. package/dist/extensions/builtin/goal/README.md +67 -67
  136. package/dist/extensions/builtin/goal/goal-controller.d.ts +39 -10
  137. package/dist/extensions/builtin/goal/goal-controller.js +1 -1
  138. package/dist/extensions/builtin/goal/goal-format.js +1 -1
  139. package/dist/extensions/builtin/goal/goal-prompts.d.ts +2 -0
  140. package/dist/extensions/builtin/goal/goal-prompts.js +5 -4
  141. package/dist/extensions/builtin/goal/goal-store.js +1 -1
  142. package/dist/extensions/builtin/goal/index.d.ts +1 -1
  143. package/dist/extensions/builtin/goal/index.js +10 -7
  144. package/dist/extensions/builtin/grub/README.md +112 -112
  145. package/dist/extensions/builtin/link-world/agent-workspace/README.md +16 -16
  146. package/dist/extensions/builtin/link-world/index.js +6 -6
  147. package/dist/extensions/builtin/link-world/internet-search/internet-search.md +65 -65
  148. package/dist/extensions/builtin/link-world/link-world-agent.md +82 -82
  149. package/dist/extensions/builtin/link-world/linkworld.md +313 -313
  150. package/dist/extensions/builtin/link-world/{network-routing.md → network-routing/network-routing.md} +67 -67
  151. package/dist/extensions/builtin/loop/README.md +92 -92
  152. package/dist/extensions/builtin/mcp/figma-design.md +68 -68
  153. package/dist/extensions/builtin/mcp/mcp-management.md +85 -85
  154. package/dist/extensions/builtin/plan/index.js +1 -1
  155. package/dist/extensions/builtin/recap/AGENT.md +15 -15
  156. package/dist/extensions/builtin/sal/README.md +72 -72
  157. package/dist/extensions/builtin/security-audit/README.md +289 -289
  158. package/dist/extensions/builtin/task/task-store.d.ts +4 -0
  159. package/dist/extensions/builtin/task/task-store.js +1 -1
  160. package/dist/extensions/builtin/team/AGENT.md +112 -112
  161. package/dist/extensions/builtin/team/TESTING.md +299 -299
  162. package/dist/extensions/builtin/token-save/README.md +56 -56
  163. package/dist/extensions/optional/AGENT.md +10 -10
  164. package/dist/index.d.ts +5 -30
  165. package/dist/index.js +1 -1
  166. package/dist/models.d.ts +7 -0
  167. package/dist/models.js +1 -0
  168. package/dist/modes/interactive/components/footer.js +1 -1
  169. package/dist/modes/interactive/components/task-status-panel.d.ts +36 -0
  170. package/dist/modes/interactive/components/task-status-panel.js +1 -0
  171. package/dist/modes/interactive/controllers/stream-render-controller.d.ts +7 -0
  172. package/dist/modes/interactive/controllers/stream-render-controller.js +2 -2
  173. package/dist/modes/interactive/interactive-mode.js +40 -40
  174. package/dist/modes/interactive/state/interactive-state.d.ts +2 -0
  175. package/dist/modes/interactive/state/interactive-state.js +1 -1
  176. package/dist/modes/interactive/theme/dark.json +85 -85
  177. package/dist/modes/interactive/theme/light.json +84 -84
  178. package/dist/modes/interactive/theme/theme-schema.json +335 -335
  179. package/dist/modes/interactive/theme/warm.json +81 -81
  180. package/dist/node_modules/@pencil-agent/ai/dist/cli.js +0 -0
  181. package/dist/node_modules/@pencil-agent/ai/dist/models.generated.js +1 -1
  182. package/dist/node_modules/@pencil-agent/ai/dist/providers/anthropic.js +2 -2
  183. package/dist/node_modules/@pencil-agent/ai/dist/providers/openai-completions.js +5 -5
  184. package/dist/node_modules/@pencil-agent/ai/dist/providers/openai-responses.js +1 -1
  185. package/dist/node_modules/@pencil-agent/ai/dist/stream.js +1 -1
  186. package/dist/packages/protocol/src/commands.d.ts +33 -0
  187. package/dist/packages/protocol/src/flags.d.ts +20 -0
  188. package/dist/packages/protocol/src/hooks.d.ts +17 -0
  189. package/dist/packages/protocol/src/hooks.js +0 -0
  190. package/dist/packages/{extension-sdk → protocol}/src/index.d.ts +7 -4
  191. package/dist/packages/protocol/src/index.js +1 -0
  192. package/dist/packages/{extension-sdk → protocol}/src/lifecycle.d.ts +15 -27
  193. package/dist/packages/protocol/src/lifecycle.js +0 -0
  194. package/dist/packages/{extension-sdk → protocol}/src/tools.d.ts +1 -1
  195. package/dist/packages/protocol/src/tools.js +0 -0
  196. package/dist/public-config.d.ts +12 -0
  197. package/dist/public-config.js +1 -0
  198. package/dist/runtime.d.ts +9 -0
  199. package/dist/runtime.js +1 -0
  200. package/dist/session-compaction.d.ts +7 -0
  201. package/dist/session-compaction.js +1 -0
  202. package/dist/session.d.ts +7 -0
  203. package/dist/session.js +1 -0
  204. package/dist/skills.d.ts +7 -0
  205. package/dist/skills.js +1 -0
  206. package/dist/tools.d.ts +7 -0
  207. package/dist/tools.js +1 -0
  208. package/docs/ACP/345/215/217/350/256/256/351/233/206/346/210/220/345/274/200/345/217/221/346/226/207/346/241/243.md +851 -0
  209. package/docs/SDK-TESTING.md +364 -0
  210. package/docs/codex-goal-command-impl.md +1055 -1055
  211. package/docs/codex-goal-vs-grub.md +500 -500
  212. package/docs/custom-provider.md +27 -27
  213. package/docs/extensions.md +27 -27
  214. package/docs/keybindings.md +27 -27
  215. package/docs/loop 重构完成总结.md +250 -250
  216. package/docs/loop 重构完成报告.md +122 -122
  217. package/docs/loop 重构方案.md +1222 -1222
  218. package/docs/loop 重构方案实现报告.md +158 -158
  219. package/docs/loop 重构方案对比分析.md +128 -128
  220. package/docs/loop 重构计划.md +320 -320
  221. package/docs/loop-usage-examples.md +214 -214
  222. package/docs/mem-core/346/212/200/346/234/257/346/226/207/346/241/243.md +593 -0
  223. package/docs/models.md +27 -27
  224. package/docs/packages.md +27 -27
  225. package/docs/pi-design-philosophy.md +457 -457
  226. package/docs/planmode.md +1987 -1987
  227. package/docs/prompt-templates.md +27 -27
  228. package/docs/providers.md +27 -27
  229. package/docs/sdk.md +27 -27
  230. package/docs/skills.md +27 -27
  231. package/docs/startup-performance-optimization.md +301 -0
  232. package/docs/themes.md +27 -27
  233. package/docs/tui.md +27 -27
  234. package/docs/认知地图.md +47 -0
  235. package/package.json +190 -162
  236. package/dist/packages/extension-sdk/src/index.js +0 -1
  237. package/docs/cc-agent-design.md +0 -1297
  238. package/docs/cc-tui-design.md +0 -1333
  239. package/docs//345/257/271/346/240/207Claude-Code.md +0 -1775
  240. /package/dist/packages/{extension-sdk/src/lifecycle.js → protocol/src/commands.js} +0 -0
  241. /package/dist/packages/{extension-sdk/src/tools.js → protocol/src/flags.js} +0 -0
@@ -1,375 +1,375 @@
1
- # Trustpilot — Company Reviews Scraping
2
-
3
- Field-tested against trustpilot.com on 2026-04-18.
4
- `http_get` with a generic Mozilla/5.0 UA works — no JS challenge, no Cloudflare block.
5
- The Trustpilot Consumer API (`api.trustpilot.com`) returns 403 for all endpoints without an API key.
6
-
7
- ---
8
-
9
- ## Fastest Approach: `http_get` + `__NEXT_DATA__`
10
-
11
- Trustpilot is a Next.js SSR app. Every company review page embeds the full data payload in a
12
- `<script id="__NEXT_DATA__">` JSON block — no browser needed. This includes the business unit
13
- metadata, all 20 reviews for the current page, pagination info, and rating distribution.
14
-
15
- ```python
16
- import re, json
17
- from helpers import http_get
18
-
19
- def get_trustpilot_page(domain, page=1, stars=None, languages='en', verified=False):
20
- """
21
- Fetch one page of reviews for a company domain.
22
- Returns (business_unit, reviews, pagination, rating_distribution).
23
- Returns (None, [], {}, {}) if page is beyond the cap or no data.
24
- """
25
- url = f"https://www.trustpilot.com/review/{domain}?languages={languages}&page={page}"
26
- if stars:
27
- url += f"&stars={stars}"
28
- if verified:
29
- url += "&verified=true"
30
-
31
- html = http_get(url)
32
- m = re.search(
33
- r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
34
- html, re.DOTALL
35
- )
36
- if not m:
37
- return None, [], {}, {}
38
-
39
- data = json.loads(m.group(1))
40
- pp = data['props']['pageProps']
41
- bu = pp['businessUnit']
42
- filters = pp.get('filters') or {}
43
- pagination = filters.get('pagination', {})
44
- ratings = filters.get('reviewStatistics', {}).get('ratings', {})
45
- reviews = pp.get('reviews', [])
46
-
47
- return bu, reviews, pagination, ratings
48
- ```
49
-
50
- ---
51
-
52
- ## Business Unit (Company) Metadata
53
-
54
- ```python
55
- bu, reviews, pagination, ratings = get_trustpilot_page("amazon.com")
56
-
57
- # Confirmed fields (tested 2026-04-18):
58
- bu['id'] # '46ad346800006400050092d0' — stable MongoDB ObjectId
59
- bu['displayName'] # 'Amazon'
60
- bu['identifyingName'] # 'www.amazon.com'
61
- bu['trustScore'] # 1.7 (float, 1.0–5.0)
62
- bu['stars'] # 1.5 (display stars: 1, 1.5, 2, 2.5 … 5)
63
- bu['numberOfReviews'] # 45228 — total across all languages
64
- bu['websiteUrl'] # 'https://www.amazon.com'
65
- bu['isClaimed'] # True/False
66
- bu['isClosed'] # True/False
67
- bu['isCollectingReviews'] # True/False
68
-
69
- # Rating distribution (from filters.reviewStatistics.ratings):
70
- ratings # {'total': 45228, 'one': 29718, 'two': 2701, 'three': 1759, 'four': 2367, 'five': 8683}
71
-
72
- # Pagination (filtered count, default is English only):
73
- pagination # {'currentPage': 1, 'perPage': 20, 'totalCount': 28039, 'totalPages': 1402}
74
- ```
75
-
76
- ---
77
-
78
- ## Review Fields
79
-
80
- Each review in the `reviews` list has these confirmed fields:
81
-
82
- ```python
83
- review = {
84
- 'id': '69e3103e09f46d6b5910f3c1', # hex ObjectId, unique
85
- 'rating': 1, # int 1–5
86
- 'title': 'UNDELIVERABLE',
87
- 'text': 'UNDELIVERABLE\nThis is the only explanation...',
88
- 'language': 'en',
89
- 'likes': 0, # upvote count
90
- 'source': 'Organic', # 'Organic' or 'Invitation'
91
- 'filtered': False,
92
- 'isPending': False,
93
-
94
- 'dates': {
95
- 'experiencedDate': '2026-03-29T00:00:00.000Z', # when they used the service
96
- 'publishedDate': '2026-04-18T07:01:50.000Z', # when review was posted
97
- 'updatedDate': None,
98
- 'submittedDate': None,
99
- },
100
-
101
- 'consumer': {
102
- 'id': '5cafe2feb158a8533b443467',
103
- 'displayName': 'Baldy Bloke',
104
- 'imageUrl': 'https://user-images.trustpilot.com/...',
105
- 'numberOfReviews': 17,
106
- 'countryCode': 'GB',
107
- 'hasImage': True,
108
- 'isVerified': False,
109
- },
110
-
111
- 'labels': {
112
- 'verification': {
113
- 'isVerified': False,
114
- 'verificationLevel': 'not-verified', # or 'verified'
115
- 'reviewSourceName': 'Organic',
116
- 'verificationSource': 'invitation',
117
- 'createdDateTime': '2026-04-18T07:01:50.000Z',
118
- 'hasDachExclusion': False,
119
- },
120
- 'merged': None,
121
- },
122
-
123
- 'reply': None, # or {'message': '...', 'publishedDate': '...', 'updatedDate': None}
124
- 'location': None, # populated for multi-location businesses
125
- 'productReviews': [], # non-empty for product-level reviews
126
- }
127
- ```
128
-
129
- ---
130
-
131
- ## Paginating — Collect Up to 200 Reviews
132
-
133
- **Hard cap: pages 1–10 work; page 11+ returns an empty `reviews` array (no error, just empty).**
134
- This cap applies per filter combination, so `stars=1` gives 200 reviews, `stars=2` gives another
135
- 200, etc.
136
-
137
- ```python
138
- import re, json, time
139
- from helpers import http_get
140
-
141
- def collect_reviews(domain, stars=None, languages='en', max_pages=10, delay=0.5):
142
- """
143
- Collect up to max_pages*20 = 200 reviews. Returns list of review dicts.
144
- stars: 1-5 to filter by rating (None = all)
145
- languages: 'en' (default), 'all', or ISO code like 'de'
146
- delay: seconds between requests (0.5 is safe; tested 5 rapid reqs with no block)
147
- """
148
- base = f"https://www.trustpilot.com/review/{domain}"
149
- params = f"?languages={languages}"
150
- if stars:
151
- params += f"&stars={stars}"
152
-
153
- all_reviews = []
154
- seen_ids = set()
155
-
156
- for page in range(1, max_pages + 1):
157
- url = f"{base}{params}&page={page}"
158
- html = http_get(url)
159
- m = re.search(
160
- r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
161
- html, re.DOTALL
162
- )
163
- if not m:
164
- break
165
- data = json.loads(m.group(1))
166
- reviews = data['props']['pageProps'].get('reviews', [])
167
- if not reviews:
168
- break # hit the page 10 cap or truly no more reviews
169
-
170
- new = [r for r in reviews if r['id'] not in seen_ids]
171
- seen_ids.update(r['id'] for r in reviews)
172
- all_reviews.extend(new)
173
-
174
- if page < max_pages:
175
- time.sleep(delay)
176
-
177
- return all_reviews
178
-
179
-
180
- # Usage — 200 reviews per call:
181
- reviews = collect_reviews("shopify.com") # English only, all ratings
182
- reviews_1star = collect_reviews("amazon.com", stars=1) # 200 x 1-star reviews
183
- reviews_all = collect_reviews("stripe.com", languages='all') # all languages
184
- ```
185
-
186
- ### Maximize unique reviews by sweeping all star ratings
187
-
188
- Since each star filter gives an independent 200-review window, you can collect up to 1,000
189
- reviews per company (reviews are deduplicated by `id` across filters):
190
-
191
- ```python
192
- all_reviews = {}
193
- for stars in range(1, 6):
194
- for r in collect_reviews("amazon.com", stars=stars, delay=0.5):
195
- all_reviews[r['id']] = r
196
-
197
- print(f"Total unique reviews: {len(all_reviews)}")
198
- ```
199
-
200
- ---
201
-
202
- ## Filters Reference
203
-
204
- All filter params are appended to the base URL `https://www.trustpilot.com/review/{domain}`:
205
-
206
- | Param | Values | Notes |
207
- |---|---|---|
208
- | `page` | 1–10 | Page 11+ returns empty `reviews` (tested). 20 reviews per page. |
209
- | `languages` | `en`, `all`, `de`, `fr`, `it`, `nl`, `sv`, `da`… | Default is `en`. Use `all` for all languages. |
210
- | `stars` | `1`, `2`, `3`, `4`, `5` | Filter to that star rating only. Works correctly. |
211
- | `verified` | `true` | Returns only invitation-verified reviews. Amazon has only ~21 verified reviews total. |
212
- | `date` | `last30days`, `last6months`, `last12months` | Reflected in `filters.selected.date` but data volume unchanged vs no filter — server-side filtering may be best-effort. |
213
- | `sort` | `recency`, `highest_rated`, `lowest_rated`, `helpful` | The `sort` param is accepted but **ignored server-side** via SSR — `filters.selected.sort` always returns `recency`. Sort only works in browser JS navigation. |
214
-
215
- ---
216
-
217
- ## Pagination Object
218
-
219
- ```python
220
- # From filters.pagination (present on pages 1–10 when data exists):
221
- pagination = {
222
- 'currentPage': 1,
223
- 'perPage': 20,
224
- 'totalCount': 28039, # filtered count (e.g. English only)
225
- 'totalPages': 1402, # math: ceil(totalCount / 20)
226
- }
227
-
228
- # NOTE: totalPages can be 1402 but you can only access pages 1–10 (200 reviews).
229
- # On page 11+ the reviews list is empty and pagination is absent.
230
- ```
231
-
232
- ---
233
-
234
- ## Rate Limits and Anti-bot
235
-
236
- - **No Cloudflare, no DataDome** — plain HTTP with `Mozilla/5.0` UA works immediately (tested
237
- 5 rapid requests in <5 seconds without any block).
238
- - **No CAPTCHA** observed during any test run.
239
- - **No 429 / rate-limit headers** seen on rapid sequential requests.
240
- - Safe rate: 0.5s between requests is conservative. Tested 5 consecutive requests at natural
241
- speed (0.2–1s each) with no issue.
242
- - **robots.txt** has `User-agent: * / Disallow: /` (all paths blocked for unnamed bots) and
243
- explicitly blocks `anthropic-ai`, `ClaudeBot`, `Claude-User`, `Claude-SearchBot`, `GPTBot`,
244
- `CCBot`, etc. Despite this, `http_get` with `Mozilla/5.0` UA is not blocked
245
- server-side (robots.txt is advisory only). Respect the policy if operating at scale.
246
-
247
- ---
248
-
249
- ## Consumer API (`api.trustpilot.com`)
250
-
251
- All Consumer API endpoints require an API key (OAuth2 client credentials). Without a key:
252
-
253
- ```
254
- GET https://api.trustpilot.com/v1/business-units/find?name=amazon.com → 403 Forbidden
255
- GET https://api.trustpilot.com/v1/business-units/{id}/reviews → 403 Forbidden
256
- ```
257
-
258
- The Business Unit ID embedded in `__NEXT_DATA__` (`businessUnit.id`) is the same ID used in the
259
- Consumer API, so if you have an API key, you can use it directly without a separate lookup.
260
-
261
- ---
262
-
263
- ## Gotchas
264
-
265
- 1. **Page cap is 10, not `totalPages`**: `filters.pagination.totalPages` may show 1402, but
266
- requests for pages 11+ return `reviews: []` silently. The server-side rendering (SSR) cap is
267
- hard-coded at page 10 (200 reviews).
268
-
269
- 2. **`totalCount` in pagination is language-filtered**: With `languages=en`, `totalCount` is the
270
- English-only count (e.g. 28,039 for Amazon). `businessUnit.numberOfReviews` is the true total
271
- across all languages (45,228). Use `languages=all` to see the full count in pagination.
272
-
273
- 3. **Sort param ignored in SSR**: `?sort=highest_rated` is reflected in `filters.selected.sort`
274
- in the JSON but the reviews returned are always `recency`-sorted. Sort only takes effect
275
- via browser-side JS navigation.
276
-
277
- 4. **Verified filter is narrow**: Amazon has 45,228 reviews but only 21 are `isVerified=True`
278
- (verificationLevel = 'verified'). Most reviews are organic/not-verified. Page 1 of
279
- `verified=true` shows a misleading `totalCount=28039` — page 2 corrects to `totalCount=21`.
280
-
281
- 5. **`date` filter behavior**: The `date` param is reflected in `filters.selected.date` but the
282
- total review counts and returned reviews do not visibly change vs no filter in testing. The
283
- server may apply it only partially or it may affect ordering rather than filtering.
284
-
285
- 6. **`languages=en` is the default**: the server applies it even when the param is omitted. Use
286
- `languages=all` explicitly to get reviews in all languages.
287
-
288
- 7. **No `__NEXT_DATA__` fallback**: Never observed an empty or missing `__NEXT_DATA__` on valid
289
- company pages. If absent, the domain may not have a Trustpilot profile — check for a
290
- redirect or 404 in the HTML title.
291
-
292
- 8. **Stars `1.5` vs `2`**: `businessUnit.stars` uses half-star display values (1.5, 2.0, etc).
293
- `businessUnit.trustScore` is the precise float (1.7). Use `trustScore` for numeric comparison.
294
-
295
- ---
296
-
297
- ## Complete One-Shot Example
298
-
299
- ```python
300
- import re, json, time
301
- from helpers import http_get
302
-
303
- def scrape_trustpilot(domain, max_unique=200):
304
- """
305
- Scrape up to max_unique reviews. Returns (company_info, reviews_list).
306
- With max_unique=1000, sweeps all 5 star ratings to maximize coverage.
307
- """
308
- def _fetch_page(domain, page, stars=None, languages='en'):
309
- url = f"https://www.trustpilot.com/review/{domain}?languages={languages}&page={page}"
310
- if stars:
311
- url += f"&stars={stars}"
312
- html = http_get(url)
313
- m = re.search(
314
- r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
315
- html, re.DOTALL
316
- )
317
- if not m:
318
- return None, []
319
- d = json.loads(m.group(1))
320
- pp = d['props']['pageProps']
321
- return pp['businessUnit'], pp.get('reviews', [])
322
-
323
- company_info = None
324
- all_reviews = {}
325
-
326
- # First page to get company info
327
- bu, reviews = _fetch_page(domain, 1)
328
- company_info = {
329
- 'id': bu['id'],
330
- 'name': bu['displayName'],
331
- 'domain': bu['identifyingName'],
332
- 'trust_score': bu['trustScore'],
333
- 'stars': bu['stars'],
334
- 'total_reviews': bu['numberOfReviews'],
335
- 'is_claimed': bu['isClaimed'],
336
- }
337
- for r in reviews:
338
- all_reviews[r['id']] = r
339
-
340
- if max_unique <= 20:
341
- return company_info, list(all_reviews.values())
342
-
343
- # Pages 2–10 (no star filter)
344
- for page in range(2, 11):
345
- if len(all_reviews) >= max_unique:
346
- break
347
- _, reviews = _fetch_page(domain, page)
348
- if not reviews:
349
- break
350
- for r in reviews:
351
- all_reviews[r['id']] = r
352
- time.sleep(0.5)
353
-
354
- # If we want more, sweep by star rating
355
- if len(all_reviews) < max_unique and max_unique > 200:
356
- for stars in range(1, 6):
357
- for page in range(1, 11):
358
- if len(all_reviews) >= max_unique:
359
- break
360
- _, reviews = _fetch_page(domain, page, stars=stars)
361
- if not reviews:
362
- break
363
- for r in reviews:
364
- all_reviews[r['id']] = r
365
- time.sleep(0.5)
366
-
367
- return company_info, list(all_reviews.values())[:max_unique]
368
-
369
-
370
- # Run it:
371
- company, reviews = scrape_trustpilot("shopify.com", max_unique=200)
372
- print(f"{company['name']} — TrustScore {company['trust_score']} — {company['total_reviews']} total reviews")
373
- print(f"Collected: {len(reviews)} reviews")
374
- print(f"Sample: [{reviews[0]['rating']}★] {reviews[0]['title'][:60]}")
375
- ```
1
+ # Trustpilot — Company Reviews Scraping
2
+
3
+ Field-tested against trustpilot.com on 2026-04-18.
4
+ `http_get` with a generic Mozilla/5.0 UA works — no JS challenge, no Cloudflare block.
5
+ The Trustpilot Consumer API (`api.trustpilot.com`) returns 403 for all endpoints without an API key.
6
+
7
+ ---
8
+
9
+ ## Fastest Approach: `http_get` + `__NEXT_DATA__`
10
+
11
+ Trustpilot is a Next.js SSR app. Every company review page embeds the full data payload in a
12
+ `<script id="__NEXT_DATA__">` JSON block — no browser needed. This includes the business unit
13
+ metadata, all 20 reviews for the current page, pagination info, and rating distribution.
14
+
15
+ ```python
16
+ import re, json
17
+ from helpers import http_get
18
+
19
+ def get_trustpilot_page(domain, page=1, stars=None, languages='en', verified=False):
20
+ """
21
+ Fetch one page of reviews for a company domain.
22
+ Returns (business_unit, reviews, pagination, rating_distribution).
23
+ Returns (None, [], {}, {}) if page is beyond the cap or no data.
24
+ """
25
+ url = f"https://www.trustpilot.com/review/{domain}?languages={languages}&page={page}"
26
+ if stars:
27
+ url += f"&stars={stars}"
28
+ if verified:
29
+ url += "&verified=true"
30
+
31
+ html = http_get(url)
32
+ m = re.search(
33
+ r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
34
+ html, re.DOTALL
35
+ )
36
+ if not m:
37
+ return None, [], {}, {}
38
+
39
+ data = json.loads(m.group(1))
40
+ pp = data['props']['pageProps']
41
+ bu = pp['businessUnit']
42
+ filters = pp.get('filters') or {}
43
+ pagination = filters.get('pagination', {})
44
+ ratings = filters.get('reviewStatistics', {}).get('ratings', {})
45
+ reviews = pp.get('reviews', [])
46
+
47
+ return bu, reviews, pagination, ratings
48
+ ```
49
+
50
+ ---
51
+
52
+ ## Business Unit (Company) Metadata
53
+
54
+ ```python
55
+ bu, reviews, pagination, ratings = get_trustpilot_page("amazon.com")
56
+
57
+ # Confirmed fields (tested 2026-04-18):
58
+ bu['id'] # '46ad346800006400050092d0' — stable MongoDB ObjectId
59
+ bu['displayName'] # 'Amazon'
60
+ bu['identifyingName'] # 'www.amazon.com'
61
+ bu['trustScore'] # 1.7 (float, 1.0–5.0)
62
+ bu['stars'] # 1.5 (display stars: 1, 1.5, 2, 2.5 … 5)
63
+ bu['numberOfReviews'] # 45228 — total across all languages
64
+ bu['websiteUrl'] # 'https://www.amazon.com'
65
+ bu['isClaimed'] # True/False
66
+ bu['isClosed'] # True/False
67
+ bu['isCollectingReviews'] # True/False
68
+
69
+ # Rating distribution (from filters.reviewStatistics.ratings):
70
+ ratings # {'total': 45228, 'one': 29718, 'two': 2701, 'three': 1759, 'four': 2367, 'five': 8683}
71
+
72
+ # Pagination (filtered count, default is English only):
73
+ pagination # {'currentPage': 1, 'perPage': 20, 'totalCount': 28039, 'totalPages': 1402}
74
+ ```
75
+
76
+ ---
77
+
78
+ ## Review Fields
79
+
80
+ Each review in the `reviews` list has these confirmed fields:
81
+
82
+ ```python
83
+ review = {
84
+ 'id': '69e3103e09f46d6b5910f3c1', # hex ObjectId, unique
85
+ 'rating': 1, # int 1–5
86
+ 'title': 'UNDELIVERABLE',
87
+ 'text': 'UNDELIVERABLE\nThis is the only explanation...',
88
+ 'language': 'en',
89
+ 'likes': 0, # upvote count
90
+ 'source': 'Organic', # 'Organic' or 'Invitation'
91
+ 'filtered': False,
92
+ 'isPending': False,
93
+
94
+ 'dates': {
95
+ 'experiencedDate': '2026-03-29T00:00:00.000Z', # when they used the service
96
+ 'publishedDate': '2026-04-18T07:01:50.000Z', # when review was posted
97
+ 'updatedDate': None,
98
+ 'submittedDate': None,
99
+ },
100
+
101
+ 'consumer': {
102
+ 'id': '5cafe2feb158a8533b443467',
103
+ 'displayName': 'Baldy Bloke',
104
+ 'imageUrl': 'https://user-images.trustpilot.com/...',
105
+ 'numberOfReviews': 17,
106
+ 'countryCode': 'GB',
107
+ 'hasImage': True,
108
+ 'isVerified': False,
109
+ },
110
+
111
+ 'labels': {
112
+ 'verification': {
113
+ 'isVerified': False,
114
+ 'verificationLevel': 'not-verified', # or 'verified'
115
+ 'reviewSourceName': 'Organic',
116
+ 'verificationSource': 'invitation',
117
+ 'createdDateTime': '2026-04-18T07:01:50.000Z',
118
+ 'hasDachExclusion': False,
119
+ },
120
+ 'merged': None,
121
+ },
122
+
123
+ 'reply': None, # or {'message': '...', 'publishedDate': '...', 'updatedDate': None}
124
+ 'location': None, # populated for multi-location businesses
125
+ 'productReviews': [], # non-empty for product-level reviews
126
+ }
127
+ ```
128
+
129
+ ---
130
+
131
+ ## Paginating — Collect Up to 200 Reviews
132
+
133
+ **Hard cap: pages 1–10 work; page 11+ returns an empty `reviews` array (no error, just empty).**
134
+ This cap applies per filter combination, so `stars=1` gives 200 reviews, `stars=2` gives another
135
+ 200, etc.
136
+
137
+ ```python
138
+ import re, json, time
139
+ from helpers import http_get
140
+
141
+ def collect_reviews(domain, stars=None, languages='en', max_pages=10, delay=0.5):
142
+ """
143
+ Collect up to max_pages*20 = 200 reviews. Returns list of review dicts.
144
+ stars: 1-5 to filter by rating (None = all)
145
+ languages: 'en' (default), 'all', or ISO code like 'de'
146
+ delay: seconds between requests (0.5 is safe; tested 5 rapid reqs with no block)
147
+ """
148
+ base = f"https://www.trustpilot.com/review/{domain}"
149
+ params = f"?languages={languages}"
150
+ if stars:
151
+ params += f"&stars={stars}"
152
+
153
+ all_reviews = []
154
+ seen_ids = set()
155
+
156
+ for page in range(1, max_pages + 1):
157
+ url = f"{base}{params}&page={page}"
158
+ html = http_get(url)
159
+ m = re.search(
160
+ r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
161
+ html, re.DOTALL
162
+ )
163
+ if not m:
164
+ break
165
+ data = json.loads(m.group(1))
166
+ reviews = data['props']['pageProps'].get('reviews', [])
167
+ if not reviews:
168
+ break # hit the page 10 cap or truly no more reviews
169
+
170
+ new = [r for r in reviews if r['id'] not in seen_ids]
171
+ seen_ids.update(r['id'] for r in reviews)
172
+ all_reviews.extend(new)
173
+
174
+ if page < max_pages:
175
+ time.sleep(delay)
176
+
177
+ return all_reviews
178
+
179
+
180
+ # Usage — 200 reviews per call:
181
+ reviews = collect_reviews("shopify.com") # English only, all ratings
182
+ reviews_1star = collect_reviews("amazon.com", stars=1) # 200 x 1-star reviews
183
+ reviews_all = collect_reviews("stripe.com", languages='all') # all languages
184
+ ```
185
+
186
+ ### Maximize unique reviews by sweeping all star ratings
187
+
188
+ Since each star filter gives an independent 200-review window, you can collect up to 1,000
189
+ reviews per company (reviews are deduplicated by `id` across the star filters):
190
+
191
+ ```python
192
+ all_reviews = {}
193
+ for stars in range(1, 6):
194
+ for r in collect_reviews("amazon.com", stars=stars, delay=0.5):
195
+ all_reviews[r['id']] = r
196
+
197
+ print(f"Total unique reviews: {len(all_reviews)}")
198
+ ```
199
+
200
+ ---
201
+
202
+ ## Filters Reference
203
+
204
+ All filter params are appended to the base URL `https://www.trustpilot.com/review/{domain}`:
205
+
206
+ | Param | Values | Notes |
207
+ |---|---|---|
208
+ | `page` | 1–10 | Page 11+ returns empty `reviews` (tested). 20 reviews per page. |
209
+ | `languages` | `en`, `all`, `de`, `fr`, `it`, `nl`, `sv`, `da`… | Default is `en`. Use `all` for all languages. |
210
+ | `stars` | `1`, `2`, `3`, `4`, `5` | Filter to that star rating only. Works correctly. |
211
+ | `verified` | `true` | Returns only invitation-verified reviews. Amazon has only ~21 verified reviews total. |
212
+ | `date` | `last30days`, `last6months`, `last12months` | Reflected in `filters.selected.date` but data volume unchanged vs no filter — server-side filtering may be best-effort. |
213
+ | `sort` | `recency`, `highest_rated`, `lowest_rated`, `helpful` | The `sort` param is accepted but **ignored server-side** via SSR — `filters.selected.sort` always returns `recency`. Sort only works in browser JS navigation. |
214
+
215
+ ---
216
+
217
+ ## Pagination Object
218
+
219
+ ```python
220
+ # From filters.pagination (present on pages 1–10 when data exists):
221
+ pagination = {
222
+ 'currentPage': 1,
223
+ 'perPage': 20,
224
+ 'totalCount': 28039, # filtered count (e.g. English only)
225
+ 'totalPages': 1402, # math: ceil(totalCount / 20)
226
+ }
227
+
228
+ # NOTE: totalPages can be 1402 but you can only access pages 1–10 (200 reviews).
229
+ # On page 11+ the reviews list is empty and pagination is absent.
230
+ ```
231
+
232
+ ---
233
+
234
+ ## Rate Limits and Anti-bot
235
+
236
+ - **No Cloudflare, no DataDome** — plain HTTP with `Mozilla/5.0` UA works immediately (tested
237
+ 5 rapid requests in <5 seconds without any block).
238
+ - **No CAPTCHA** observed during any test run.
239
+ - **No 429 / rate-limit headers** seen on rapid sequential requests.
240
+ - Safe rate: 0.5s between requests is conservative. Tested 5 consecutive requests at natural
241
+ speed (0.2–1s each) with no issue.
242
+ - **robots.txt** has `User-agent: * / Disallow: /` (all paths blocked for unnamed bots) and
243
+ explicitly blocks `anthropic-ai`, `ClaudeBot`, `Claude-User`, `Claude-SearchBot`, `GPTBot`,
244
+ `CCBot`, etc. Despite this, `http_get` with `Mozilla/5.0` UA is not blocked
245
+ server-side (robots.txt is advisory only). Respect the policy if operating at scale.
246
+
247
+ ---
248
+
249
+ ## Consumer API (`api.trustpilot.com`)
250
+
251
+ All Consumer API endpoints require an API key (OAuth2 client credentials). Without a key:
252
+
253
+ ```
254
+ GET https://api.trustpilot.com/v1/business-units/find?name=amazon.com → 403 Forbidden
255
+ GET https://api.trustpilot.com/v1/business-units/{id}/reviews → 403 Forbidden
256
+ ```
257
+
258
+ The Business Unit ID embedded in `__NEXT_DATA__` (`businessUnit.id`) is the same ID used in the
259
+ Consumer API, so if you have an API key, you can use it directly without a separate lookup.
260
+
261
+ ---
262
+
263
+ ## Gotchas
264
+
265
+ 1. **Page cap is 10, not `totalPages`**: `filters.pagination.totalPages` may show 1402, but
266
+ requests for pages 11+ return `reviews: []` silently. The server-side rendering (SSR) cap is
267
+ hard-coded at page 10 (200 reviews).
268
+
269
+ 2. **`totalCount` in pagination is language-filtered**: With `languages=en`, `totalCount` is the
270
+ English-only count (e.g. 28,039 for Amazon). `businessUnit.numberOfReviews` is the true total
271
+ across all languages (45,228). Use `languages=all` to see the full count in pagination.
272
+
273
+ 3. **Sort param ignored in SSR**: `?sort=highest_rated` is reflected in `filters.selected.sort`
274
+ in the JSON but the reviews returned are always `recency`-sorted. Sort only takes effect
275
+ via browser-side JS navigation.
276
+
277
+ 4. **Verified filter is narrow**: Amazon has 45,228 reviews but only 21 are `isVerified=True`
278
+ (verificationLevel = 'verified'). Most reviews are organic/not-verified. Page 1 of
279
+ `verified=true` shows a misleading `totalCount=28039` — page 2 corrects to `totalCount=21`.
280
+
281
+ 5. **`date` filter behavior**: The `date` param is reflected in `filters.selected.date` but the
282
+ total review counts and returned reviews do not visibly change vs no filter in testing. The
283
+ server may apply it only partially or it may affect ordering rather than filtering.
284
+
285
+ 6. **`languages=en` is the default** and the server returns it even without the param. Use
286
+ `languages=all` explicitly to get reviews in all languages.
287
+
288
+ 7. **No `__NEXT_DATA__` fallback**: Never observed an empty or missing `__NEXT_DATA__` on valid
289
+ company pages. If absent, the domain may not have a Trustpilot profile — check for a
290
+ redirect or 404 in the HTML title.
291
+
292
+ 8. **Stars `1.5` vs `2`**: `businessUnit.stars` uses half-star display values (1.5, 2.0, etc.).
293
+ `businessUnit.trustScore` is the precise float (1.7). Use `trustScore` for numeric comparison.
294
+
295
+ ---
296
+
297
+ ## Complete One-Shot Example
298
+
299
+ ```python
300
+ import re, json, time
301
+ from helpers import http_get
302
+
303
+ def scrape_trustpilot(domain, max_unique=200):
304
+ """
305
+ Scrape up to max_unique reviews. Returns (company_info, reviews_list).
306
+ With max_unique=1000, sweeps all 5 star ratings to maximize coverage.
307
+ """
308
+ def _fetch_page(domain, page, stars=None, languages='en'):
309
+ url = f"https://www.trustpilot.com/review/{domain}?languages={languages}&page={page}"
310
+ if stars:
311
+ url += f"&stars={stars}"
312
+ html = http_get(url)
313
+ m = re.search(
314
+ r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
315
+ html, re.DOTALL
316
+ )
317
+ if not m:
318
+ return None, []
319
+ d = json.loads(m.group(1))
320
+ pp = d['props']['pageProps']
321
+ return pp['businessUnit'], pp.get('reviews', [])
322
+
323
+ company_info = None
324
+ all_reviews = {}
325
+
326
+ # First page to get company info
327
+ bu, reviews = _fetch_page(domain, 1)
328
+ company_info = {
329
+ 'id': bu['id'],
330
+ 'name': bu['displayName'],
331
+ 'domain': bu['identifyingName'],
332
+ 'trust_score': bu['trustScore'],
333
+ 'stars': bu['stars'],
334
+ 'total_reviews': bu['numberOfReviews'],
335
+ 'is_claimed': bu['isClaimed'],
336
+ }
337
+ for r in reviews:
338
+ all_reviews[r['id']] = r
339
+
340
+ if max_unique <= 20:
341
+ return company_info, list(all_reviews.values())
342
+
343
+ # Pages 2–10 (no star filter)
344
+ for page in range(2, 11):
345
+ if len(all_reviews) >= max_unique:
346
+ break
347
+ _, reviews = _fetch_page(domain, page)
348
+ if not reviews:
349
+ break
350
+ for r in reviews:
351
+ all_reviews[r['id']] = r
352
+ time.sleep(0.5)
353
+
354
+ # If we want more, sweep by star rating
355
+ if len(all_reviews) < max_unique and max_unique > 200:
356
+ for stars in range(1, 6):
357
+ for page in range(1, 11):
358
+ if len(all_reviews) >= max_unique:
359
+ break
360
+ _, reviews = _fetch_page(domain, page, stars=stars)
361
+ if not reviews:
362
+ break
363
+ for r in reviews:
364
+ all_reviews[r['id']] = r
365
+ time.sleep(0.5)
366
+
367
+ return company_info, list(all_reviews.values())[:max_unique]
368
+
369
+
370
+ # Run it:
371
+ company, reviews = scrape_trustpilot("shopify.com", max_unique=200)
372
+ print(f"{company['name']} — TrustScore {company['trust_score']} — {company['total_reviews']} total reviews")
373
+ print(f"Collected: {len(reviews)} reviews")
374
+ print(f"Sample: [{reviews[0]['rating']}★] {reviews[0]['title'][:60]}")
375
+ ```