flowtask-5.8.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (470)
  1. flowtask/__init__.py +93 -0
  2. flowtask/__main__.py +38 -0
  3. flowtask/bots/__init__.py +6 -0
  4. flowtask/bots/check.py +93 -0
  5. flowtask/bots/codebot.py +51 -0
  6. flowtask/components/ASPX.py +148 -0
  7. flowtask/components/AddDataset.py +352 -0
  8. flowtask/components/Amazon.py +523 -0
  9. flowtask/components/AutoTask.py +314 -0
  10. flowtask/components/Azure.py +80 -0
  11. flowtask/components/AzureUsers.py +106 -0
  12. flowtask/components/BaseAction.py +91 -0
  13. flowtask/components/BaseLoop.py +198 -0
  14. flowtask/components/BestBuy.py +800 -0
  15. flowtask/components/CSVToGCS.py +120 -0
  16. flowtask/components/CompanyScraper/__init__.py +1 -0
  17. flowtask/components/CompanyScraper/parsers/__init__.py +6 -0
  18. flowtask/components/CompanyScraper/parsers/base.py +102 -0
  19. flowtask/components/CompanyScraper/parsers/explorium.py +192 -0
  20. flowtask/components/CompanyScraper/parsers/leadiq.py +206 -0
  21. flowtask/components/CompanyScraper/parsers/rocket.py +133 -0
  22. flowtask/components/CompanyScraper/parsers/siccode.py +109 -0
  23. flowtask/components/CompanyScraper/parsers/visualvisitor.py +130 -0
  24. flowtask/components/CompanyScraper/parsers/zoominfo.py +118 -0
  25. flowtask/components/CompanyScraper/scrapper.py +1054 -0
  26. flowtask/components/CopyTo.py +177 -0
  27. flowtask/components/CopyToBigQuery.py +243 -0
  28. flowtask/components/CopyToMongoDB.py +291 -0
  29. flowtask/components/CopyToPg.py +609 -0
  30. flowtask/components/CopyToRethink.py +207 -0
  31. flowtask/components/CreateGCSBucket.py +102 -0
  32. flowtask/components/CreateReport/CreateReport.py +228 -0
  33. flowtask/components/CreateReport/__init__.py +9 -0
  34. flowtask/components/CreateReport/charts/__init__.py +15 -0
  35. flowtask/components/CreateReport/charts/bar.py +51 -0
  36. flowtask/components/CreateReport/charts/base.py +66 -0
  37. flowtask/components/CreateReport/charts/pie.py +64 -0
  38. flowtask/components/CreateReport/utils.py +9 -0
  39. flowtask/components/CustomerSatisfaction.py +196 -0
  40. flowtask/components/DataInput.py +200 -0
  41. flowtask/components/DateList.py +255 -0
  42. flowtask/components/DbClient.py +163 -0
  43. flowtask/components/DialPad.py +146 -0
  44. flowtask/components/DocumentDBQuery.py +200 -0
  45. flowtask/components/DownloadFrom.py +371 -0
  46. flowtask/components/DownloadFromD2L.py +113 -0
  47. flowtask/components/DownloadFromFTP.py +181 -0
  48. flowtask/components/DownloadFromIMAP.py +315 -0
  49. flowtask/components/DownloadFromS3.py +198 -0
  50. flowtask/components/DownloadFromSFTP.py +265 -0
  51. flowtask/components/DownloadFromSharepoint.py +110 -0
  52. flowtask/components/DownloadFromSmartSheet.py +114 -0
  53. flowtask/components/DownloadS3File.py +229 -0
  54. flowtask/components/Dummy.py +59 -0
  55. flowtask/components/DuplicatePhoto.py +411 -0
  56. flowtask/components/EmployeeEvaluation.py +237 -0
  57. flowtask/components/ExecuteSQL.py +323 -0
  58. flowtask/components/ExtractHTML.py +178 -0
  59. flowtask/components/FileBase.py +178 -0
  60. flowtask/components/FileCopy.py +181 -0
  61. flowtask/components/FileDelete.py +82 -0
  62. flowtask/components/FileExists.py +146 -0
  63. flowtask/components/FileIteratorDelete.py +112 -0
  64. flowtask/components/FileList.py +194 -0
  65. flowtask/components/FileOpen.py +75 -0
  66. flowtask/components/FileRead.py +120 -0
  67. flowtask/components/FileRename.py +106 -0
  68. flowtask/components/FilterIf.py +284 -0
  69. flowtask/components/FilterRows/FilterRows.py +200 -0
  70. flowtask/components/FilterRows/__init__.py +10 -0
  71. flowtask/components/FilterRows/functions.py +4 -0
  72. flowtask/components/GCSToBigQuery.py +103 -0
  73. flowtask/components/GoogleA4.py +150 -0
  74. flowtask/components/GoogleGeoCoding.py +344 -0
  75. flowtask/components/GooglePlaces.py +315 -0
  76. flowtask/components/GoogleSearch.py +539 -0
  77. flowtask/components/HTTPClient.py +268 -0
  78. flowtask/components/ICIMS.py +146 -0
  79. flowtask/components/IF.py +179 -0
  80. flowtask/components/IcimsFolderCopy.py +173 -0
  81. flowtask/components/ImageFeatures/__init__.py +5 -0
  82. flowtask/components/ImageFeatures/process.py +233 -0
  83. flowtask/components/IteratorBase.py +251 -0
  84. flowtask/components/LangchainLoader/__init__.py +5 -0
  85. flowtask/components/LangchainLoader/loader.py +194 -0
  86. flowtask/components/LangchainLoader/loaders/__init__.py +22 -0
  87. flowtask/components/LangchainLoader/loaders/abstract.py +362 -0
  88. flowtask/components/LangchainLoader/loaders/basepdf.py +50 -0
  89. flowtask/components/LangchainLoader/loaders/docx.py +91 -0
  90. flowtask/components/LangchainLoader/loaders/html.py +119 -0
  91. flowtask/components/LangchainLoader/loaders/pdfblocks.py +146 -0
  92. flowtask/components/LangchainLoader/loaders/pdfmark.py +79 -0
  93. flowtask/components/LangchainLoader/loaders/pdftables.py +135 -0
  94. flowtask/components/LangchainLoader/loaders/qa.py +67 -0
  95. flowtask/components/LangchainLoader/loaders/txt.py +55 -0
  96. flowtask/components/LeadIQ.py +650 -0
  97. flowtask/components/Loop.py +253 -0
  98. flowtask/components/Lowes.py +334 -0
  99. flowtask/components/MS365Usage.py +156 -0
  100. flowtask/components/MSTeamsMessages.py +320 -0
  101. flowtask/components/MarketClustering.py +1051 -0
  102. flowtask/components/MergeFiles.py +362 -0
  103. flowtask/components/MilvusOutput.py +87 -0
  104. flowtask/components/NearByStores.py +175 -0
  105. flowtask/components/NetworkNinja/__init__.py +6 -0
  106. flowtask/components/NetworkNinja/models/__init__.py +52 -0
  107. flowtask/components/NetworkNinja/models/abstract.py +177 -0
  108. flowtask/components/NetworkNinja/models/account.py +39 -0
  109. flowtask/components/NetworkNinja/models/client.py +19 -0
  110. flowtask/components/NetworkNinja/models/district.py +14 -0
  111. flowtask/components/NetworkNinja/models/events.py +101 -0
  112. flowtask/components/NetworkNinja/models/forms.py +499 -0
  113. flowtask/components/NetworkNinja/models/market.py +16 -0
  114. flowtask/components/NetworkNinja/models/organization.py +34 -0
  115. flowtask/components/NetworkNinja/models/photos.py +125 -0
  116. flowtask/components/NetworkNinja/models/project.py +44 -0
  117. flowtask/components/NetworkNinja/models/region.py +28 -0
  118. flowtask/components/NetworkNinja/models/store.py +203 -0
  119. flowtask/components/NetworkNinja/models/user.py +151 -0
  120. flowtask/components/NetworkNinja/router.py +854 -0
  121. flowtask/components/Odoo.py +175 -0
  122. flowtask/components/OdooInjector.py +192 -0
  123. flowtask/components/OpenFromXML.py +126 -0
  124. flowtask/components/OpenWeather.py +41 -0
  125. flowtask/components/OpenWithBase.py +616 -0
  126. flowtask/components/OpenWithPandas.py +715 -0
  127. flowtask/components/PGPDecrypt.py +199 -0
  128. flowtask/components/PandasIterator.py +187 -0
  129. flowtask/components/PandasToFile.py +189 -0
  130. flowtask/components/Paradox.py +339 -0
  131. flowtask/components/ParamIterator.py +117 -0
  132. flowtask/components/ParseHTML.py +84 -0
  133. flowtask/components/PlacerStores.py +249 -0
  134. flowtask/components/Pokemon.py +507 -0
  135. flowtask/components/PositiveBot.py +62 -0
  136. flowtask/components/PowerPointSlide.py +400 -0
  137. flowtask/components/PrintMessage.py +127 -0
  138. flowtask/components/ProductCompetitors/__init__.py +5 -0
  139. flowtask/components/ProductCompetitors/parsers/__init__.py +7 -0
  140. flowtask/components/ProductCompetitors/parsers/base.py +72 -0
  141. flowtask/components/ProductCompetitors/parsers/bestbuy.py +86 -0
  142. flowtask/components/ProductCompetitors/parsers/lowes.py +103 -0
  143. flowtask/components/ProductCompetitors/scrapper.py +155 -0
  144. flowtask/components/ProductCompliant.py +169 -0
  145. flowtask/components/ProductInfo/__init__.py +1 -0
  146. flowtask/components/ProductInfo/parsers/__init__.py +5 -0
  147. flowtask/components/ProductInfo/parsers/base.py +83 -0
  148. flowtask/components/ProductInfo/parsers/brother.py +97 -0
  149. flowtask/components/ProductInfo/parsers/canon.py +167 -0
  150. flowtask/components/ProductInfo/parsers/epson.py +118 -0
  151. flowtask/components/ProductInfo/parsers/hp.py +131 -0
  152. flowtask/components/ProductInfo/parsers/samsung.py +97 -0
  153. flowtask/components/ProductInfo/scraper.py +319 -0
  154. flowtask/components/ProductPricing.py +118 -0
  155. flowtask/components/QS.py +261 -0
  156. flowtask/components/QSBase.py +201 -0
  157. flowtask/components/QueryIterator.py +273 -0
  158. flowtask/components/QueryToInsert.py +327 -0
  159. flowtask/components/QueryToPandas.py +432 -0
  160. flowtask/components/RESTClient.py +195 -0
  161. flowtask/components/RethinkDBQuery.py +189 -0
  162. flowtask/components/Rsync.py +74 -0
  163. flowtask/components/RunSSH.py +59 -0
  164. flowtask/components/RunShell.py +71 -0
  165. flowtask/components/SalesForce.py +20 -0
  166. flowtask/components/SaveImageBank/__init__.py +257 -0
  167. flowtask/components/SchedulingVisits.py +592 -0
  168. flowtask/components/ScrapPage.py +216 -0
  169. flowtask/components/ScrapSearch.py +79 -0
  170. flowtask/components/SendNotify.py +257 -0
  171. flowtask/components/SentimentAnalysis.py +694 -0
  172. flowtask/components/ServiceScrapper/__init__.py +5 -0
  173. flowtask/components/ServiceScrapper/parsers/__init__.py +1 -0
  174. flowtask/components/ServiceScrapper/parsers/base.py +94 -0
  175. flowtask/components/ServiceScrapper/parsers/costco.py +93 -0
  176. flowtask/components/ServiceScrapper/scrapper.py +199 -0
  177. flowtask/components/SetVariables.py +156 -0
  178. flowtask/components/SubTask.py +182 -0
  179. flowtask/components/SuiteCRM.py +48 -0
  180. flowtask/components/Switch.py +175 -0
  181. flowtask/components/TableBase.py +148 -0
  182. flowtask/components/TableDelete.py +312 -0
  183. flowtask/components/TableInput.py +143 -0
  184. flowtask/components/TableOutput/TableOutput.py +384 -0
  185. flowtask/components/TableOutput/__init__.py +3 -0
  186. flowtask/components/TableSchema.py +534 -0
  187. flowtask/components/Target.py +223 -0
  188. flowtask/components/ThumbnailGenerator.py +156 -0
  189. flowtask/components/ToPandas.py +67 -0
  190. flowtask/components/TransformRows/TransformRows.py +507 -0
  191. flowtask/components/TransformRows/__init__.py +9 -0
  192. flowtask/components/TransformRows/functions.py +559 -0
  193. flowtask/components/TransposeRows.py +176 -0
  194. flowtask/components/UPCDatabase.py +86 -0
  195. flowtask/components/UnGzip.py +171 -0
  196. flowtask/components/Uncompress.py +172 -0
  197. flowtask/components/UniqueRows.py +126 -0
  198. flowtask/components/Unzip.py +107 -0
  199. flowtask/components/UpdateOperationalVars.py +147 -0
  200. flowtask/components/UploadTo.py +299 -0
  201. flowtask/components/UploadToS3.py +136 -0
  202. flowtask/components/UploadToSFTP.py +160 -0
  203. flowtask/components/UploadToSharepoint.py +205 -0
  204. flowtask/components/UserFunc.py +122 -0
  205. flowtask/components/VivaTracker.py +140 -0
  206. flowtask/components/WSDLClient.py +123 -0
  207. flowtask/components/Wait.py +18 -0
  208. flowtask/components/Walmart.py +199 -0
  209. flowtask/components/Workplace.py +134 -0
  210. flowtask/components/XMLToPandas.py +267 -0
  211. flowtask/components/Zammad/__init__.py +41 -0
  212. flowtask/components/Zammad/models.py +0 -0
  213. flowtask/components/ZoomInfoScraper.py +409 -0
  214. flowtask/components/__init__.py +104 -0
  215. flowtask/components/abstract.py +18 -0
  216. flowtask/components/flow.py +530 -0
  217. flowtask/components/google.py +335 -0
  218. flowtask/components/group.py +221 -0
  219. flowtask/components/py.typed +0 -0
  220. flowtask/components/reviewscrap.py +132 -0
  221. flowtask/components/tAutoincrement.py +117 -0
  222. flowtask/components/tConcat.py +109 -0
  223. flowtask/components/tExplode.py +119 -0
  224. flowtask/components/tFilter.py +184 -0
  225. flowtask/components/tGroup.py +236 -0
  226. flowtask/components/tJoin.py +270 -0
  227. flowtask/components/tMap/__init__.py +9 -0
  228. flowtask/components/tMap/functions.py +54 -0
  229. flowtask/components/tMap/tMap.py +450 -0
  230. flowtask/components/tMelt.py +112 -0
  231. flowtask/components/tMerge.py +114 -0
  232. flowtask/components/tOrder.py +93 -0
  233. flowtask/components/tPandas.py +94 -0
  234. flowtask/components/tPivot.py +71 -0
  235. flowtask/components/tPluckCols.py +76 -0
  236. flowtask/components/tUnnest.py +82 -0
  237. flowtask/components/user.py +401 -0
  238. flowtask/conf.py +457 -0
  239. flowtask/download.py +102 -0
  240. flowtask/events/__init__.py +11 -0
  241. flowtask/events/events/__init__.py +20 -0
  242. flowtask/events/events/abstract.py +95 -0
  243. flowtask/events/events/alerts/__init__.py +362 -0
  244. flowtask/events/events/alerts/colfunctions.py +131 -0
  245. flowtask/events/events/alerts/functions.py +158 -0
  246. flowtask/events/events/dummy.py +12 -0
  247. flowtask/events/events/exec.py +124 -0
  248. flowtask/events/events/file/__init__.py +7 -0
  249. flowtask/events/events/file/base.py +51 -0
  250. flowtask/events/events/file/copy.py +23 -0
  251. flowtask/events/events/file/delete.py +16 -0
  252. flowtask/events/events/interfaces/__init__.py +9 -0
  253. flowtask/events/events/interfaces/client.py +67 -0
  254. flowtask/events/events/interfaces/credentials.py +28 -0
  255. flowtask/events/events/interfaces/notifications.py +58 -0
  256. flowtask/events/events/jira.py +122 -0
  257. flowtask/events/events/log.py +26 -0
  258. flowtask/events/events/logerr.py +52 -0
  259. flowtask/events/events/notify.py +59 -0
  260. flowtask/events/events/notify_event.py +160 -0
  261. flowtask/events/events/publish.py +54 -0
  262. flowtask/events/events/sendfile.py +104 -0
  263. flowtask/events/events/task.py +97 -0
  264. flowtask/events/events/teams.py +98 -0
  265. flowtask/events/events/webhook.py +58 -0
  266. flowtask/events/manager.py +287 -0
  267. flowtask/exceptions.c +39393 -0
  268. flowtask/exceptions.cpython-312-x86_64-linux-gnu.so +0 -0
  269. flowtask/extensions/__init__.py +3 -0
  270. flowtask/extensions/abstract.py +82 -0
  271. flowtask/extensions/logging/__init__.py +65 -0
  272. flowtask/hooks/__init__.py +9 -0
  273. flowtask/hooks/actions/__init__.py +22 -0
  274. flowtask/hooks/actions/abstract.py +66 -0
  275. flowtask/hooks/actions/dummy.py +23 -0
  276. flowtask/hooks/actions/jira.py +74 -0
  277. flowtask/hooks/actions/rest.py +320 -0
  278. flowtask/hooks/actions/sampledata.py +37 -0
  279. flowtask/hooks/actions/sensor.py +23 -0
  280. flowtask/hooks/actions/task.py +9 -0
  281. flowtask/hooks/actions/ticket.py +37 -0
  282. flowtask/hooks/actions/zammad.py +55 -0
  283. flowtask/hooks/hook.py +62 -0
  284. flowtask/hooks/models.py +17 -0
  285. flowtask/hooks/service.py +187 -0
  286. flowtask/hooks/step.py +91 -0
  287. flowtask/hooks/types/__init__.py +23 -0
  288. flowtask/hooks/types/base.py +129 -0
  289. flowtask/hooks/types/brokers/__init__.py +11 -0
  290. flowtask/hooks/types/brokers/base.py +54 -0
  291. flowtask/hooks/types/brokers/mqtt.py +35 -0
  292. flowtask/hooks/types/brokers/rabbitmq.py +82 -0
  293. flowtask/hooks/types/brokers/redis.py +83 -0
  294. flowtask/hooks/types/brokers/sqs.py +44 -0
  295. flowtask/hooks/types/fs.py +232 -0
  296. flowtask/hooks/types/http.py +49 -0
  297. flowtask/hooks/types/imap.py +200 -0
  298. flowtask/hooks/types/jira.py +279 -0
  299. flowtask/hooks/types/mail.py +205 -0
  300. flowtask/hooks/types/postgres.py +98 -0
  301. flowtask/hooks/types/responses/__init__.py +8 -0
  302. flowtask/hooks/types/responses/base.py +5 -0
  303. flowtask/hooks/types/sharepoint.py +288 -0
  304. flowtask/hooks/types/ssh.py +141 -0
  305. flowtask/hooks/types/tagged.py +59 -0
  306. flowtask/hooks/types/upload.py +85 -0
  307. flowtask/hooks/types/watch.py +71 -0
  308. flowtask/hooks/types/web.py +36 -0
  309. flowtask/interfaces/AzureClient.py +137 -0
  310. flowtask/interfaces/AzureGraph.py +839 -0
  311. flowtask/interfaces/Boto3Client.py +326 -0
  312. flowtask/interfaces/DropboxClient.py +173 -0
  313. flowtask/interfaces/ExcelHandler.py +94 -0
  314. flowtask/interfaces/FTPClient.py +131 -0
  315. flowtask/interfaces/GoogleCalendar.py +201 -0
  316. flowtask/interfaces/GoogleClient.py +133 -0
  317. flowtask/interfaces/GoogleDrive.py +127 -0
  318. flowtask/interfaces/GoogleGCS.py +89 -0
  319. flowtask/interfaces/GoogleGeocoding.py +93 -0
  320. flowtask/interfaces/GoogleLang.py +114 -0
  321. flowtask/interfaces/GooglePub.py +61 -0
  322. flowtask/interfaces/GoogleSheet.py +68 -0
  323. flowtask/interfaces/IMAPClient.py +137 -0
  324. flowtask/interfaces/O365Calendar.py +113 -0
  325. flowtask/interfaces/O365Client.py +220 -0
  326. flowtask/interfaces/OneDrive.py +284 -0
  327. flowtask/interfaces/Outlook.py +155 -0
  328. flowtask/interfaces/ParrotBot.py +130 -0
  329. flowtask/interfaces/SSHClient.py +378 -0
  330. flowtask/interfaces/Sharepoint.py +496 -0
  331. flowtask/interfaces/__init__.py +36 -0
  332. flowtask/interfaces/azureauth.py +119 -0
  333. flowtask/interfaces/cache.py +201 -0
  334. flowtask/interfaces/client.py +82 -0
  335. flowtask/interfaces/compress.py +525 -0
  336. flowtask/interfaces/credentials.py +124 -0
  337. flowtask/interfaces/d2l.py +239 -0
  338. flowtask/interfaces/databases/__init__.py +5 -0
  339. flowtask/interfaces/databases/db.py +223 -0
  340. flowtask/interfaces/databases/documentdb.py +55 -0
  341. flowtask/interfaces/databases/rethink.py +39 -0
  342. flowtask/interfaces/dataframes/__init__.py +11 -0
  343. flowtask/interfaces/dataframes/abstract.py +21 -0
  344. flowtask/interfaces/dataframes/arrow.py +71 -0
  345. flowtask/interfaces/dataframes/dt.py +69 -0
  346. flowtask/interfaces/dataframes/pandas.py +167 -0
  347. flowtask/interfaces/dataframes/polars.py +60 -0
  348. flowtask/interfaces/db.py +263 -0
  349. flowtask/interfaces/env.py +46 -0
  350. flowtask/interfaces/func.py +137 -0
  351. flowtask/interfaces/http.py +1780 -0
  352. flowtask/interfaces/locale.py +40 -0
  353. flowtask/interfaces/log.py +75 -0
  354. flowtask/interfaces/mask.py +143 -0
  355. flowtask/interfaces/notification.py +154 -0
  356. flowtask/interfaces/playwright.py +339 -0
  357. flowtask/interfaces/powerpoint.py +368 -0
  358. flowtask/interfaces/py.typed +0 -0
  359. flowtask/interfaces/qs.py +376 -0
  360. flowtask/interfaces/result.py +87 -0
  361. flowtask/interfaces/selenium_service.py +779 -0
  362. flowtask/interfaces/smartsheet.py +154 -0
  363. flowtask/interfaces/stat.py +39 -0
  364. flowtask/interfaces/task.py +96 -0
  365. flowtask/interfaces/template.py +118 -0
  366. flowtask/interfaces/vectorstores/__init__.py +1 -0
  367. flowtask/interfaces/vectorstores/abstract.py +133 -0
  368. flowtask/interfaces/vectorstores/milvus.py +669 -0
  369. flowtask/interfaces/zammad.py +107 -0
  370. flowtask/models.py +193 -0
  371. flowtask/parsers/__init__.py +15 -0
  372. flowtask/parsers/_yaml.c +11978 -0
  373. flowtask/parsers/_yaml.cpython-312-x86_64-linux-gnu.so +0 -0
  374. flowtask/parsers/argparser.py +235 -0
  375. flowtask/parsers/base.c +15155 -0
  376. flowtask/parsers/base.cpython-312-x86_64-linux-gnu.so +0 -0
  377. flowtask/parsers/json.c +11968 -0
  378. flowtask/parsers/json.cpython-312-x86_64-linux-gnu.so +0 -0
  379. flowtask/parsers/maps.py +49 -0
  380. flowtask/parsers/toml.c +11968 -0
  381. flowtask/parsers/toml.cpython-312-x86_64-linux-gnu.so +0 -0
  382. flowtask/plugins/__init__.py +16 -0
  383. flowtask/plugins/components/__init__.py +0 -0
  384. flowtask/plugins/handler/__init__.py +45 -0
  385. flowtask/plugins/importer.py +31 -0
  386. flowtask/plugins/sources/__init__.py +0 -0
  387. flowtask/runner.py +283 -0
  388. flowtask/scheduler/__init__.py +9 -0
  389. flowtask/scheduler/functions.py +493 -0
  390. flowtask/scheduler/handlers/__init__.py +8 -0
  391. flowtask/scheduler/handlers/manager.py +504 -0
  392. flowtask/scheduler/handlers/models.py +58 -0
  393. flowtask/scheduler/handlers/service.py +72 -0
  394. flowtask/scheduler/notifications.py +65 -0
  395. flowtask/scheduler/scheduler.py +993 -0
  396. flowtask/services/__init__.py +0 -0
  397. flowtask/services/bots/__init__.py +0 -0
  398. flowtask/services/bots/telegram.py +264 -0
  399. flowtask/services/files/__init__.py +11 -0
  400. flowtask/services/files/manager.py +522 -0
  401. flowtask/services/files/model.py +37 -0
  402. flowtask/services/files/service.py +767 -0
  403. flowtask/services/jira/__init__.py +3 -0
  404. flowtask/services/jira/jira_actions.py +191 -0
  405. flowtask/services/tasks/__init__.py +13 -0
  406. flowtask/services/tasks/launcher.py +213 -0
  407. flowtask/services/tasks/manager.py +323 -0
  408. flowtask/services/tasks/service.py +275 -0
  409. flowtask/services/tasks/task_manager.py +376 -0
  410. flowtask/services/tasks/tasks.py +155 -0
  411. flowtask/storages/__init__.py +16 -0
  412. flowtask/storages/exceptions.py +12 -0
  413. flowtask/storages/files/__init__.py +8 -0
  414. flowtask/storages/files/abstract.py +29 -0
  415. flowtask/storages/files/filesystem.py +66 -0
  416. flowtask/storages/tasks/__init__.py +19 -0
  417. flowtask/storages/tasks/abstract.py +26 -0
  418. flowtask/storages/tasks/database.py +33 -0
  419. flowtask/storages/tasks/filesystem.py +108 -0
  420. flowtask/storages/tasks/github.py +119 -0
  421. flowtask/storages/tasks/memory.py +45 -0
  422. flowtask/storages/tasks/row.py +25 -0
  423. flowtask/tasks/__init__.py +0 -0
  424. flowtask/tasks/abstract.py +526 -0
  425. flowtask/tasks/command.py +118 -0
  426. flowtask/tasks/pile.py +486 -0
  427. flowtask/tasks/py.typed +0 -0
  428. flowtask/tasks/task.py +778 -0
  429. flowtask/template/__init__.py +161 -0
  430. flowtask/tests.py +257 -0
  431. flowtask/types/__init__.py +8 -0
  432. flowtask/types/typedefs.c +11347 -0
  433. flowtask/types/typedefs.cpython-312-x86_64-linux-gnu.so +0 -0
  434. flowtask/utils/__init__.py +24 -0
  435. flowtask/utils/constants.py +117 -0
  436. flowtask/utils/encoders.py +21 -0
  437. flowtask/utils/executor.py +112 -0
  438. flowtask/utils/functions.cpp +14280 -0
  439. flowtask/utils/functions.cpython-312-x86_64-linux-gnu.so +0 -0
  440. flowtask/utils/json.cpp +13349 -0
  441. flowtask/utils/json.cpython-312-x86_64-linux-gnu.so +0 -0
  442. flowtask/utils/mail.py +63 -0
  443. flowtask/utils/parseqs.c +13324 -0
  444. flowtask/utils/parserqs.cpython-312-x86_64-linux-gnu.so +0 -0
  445. flowtask/utils/stats.py +308 -0
  446. flowtask/utils/transformations.py +74 -0
  447. flowtask/utils/uv.py +12 -0
  448. flowtask/utils/validators.py +97 -0
  449. flowtask/version.py +11 -0
  450. flowtask-5.8.4.dist-info/LICENSE +201 -0
  451. flowtask-5.8.4.dist-info/METADATA +209 -0
  452. flowtask-5.8.4.dist-info/RECORD +470 -0
  453. flowtask-5.8.4.dist-info/WHEEL +6 -0
  454. flowtask-5.8.4.dist-info/entry_points.txt +3 -0
  455. flowtask-5.8.4.dist-info/top_level.txt +2 -0
  456. plugins/components/CreateQR.py +39 -0
  457. plugins/components/TestComponent.py +28 -0
  458. plugins/components/Use1.py +13 -0
  459. plugins/components/Workplace.py +117 -0
  460. plugins/components/__init__.py +3 -0
  461. plugins/sources/__init__.py +0 -0
  462. plugins/sources/get_populartimes.py +78 -0
  463. plugins/sources/google.py +150 -0
  464. plugins/sources/hubspot.py +679 -0
  465. plugins/sources/icims.py +679 -0
  466. plugins/sources/mobileinsight.py +501 -0
  467. plugins/sources/newrelic.py +262 -0
  468. plugins/sources/uap.py +268 -0
  469. plugins/sources/venu.py +244 -0
  470. plugins/sources/vocinity.py +314 -0
flowtask/components/CompanyScraper/scrapper.py
@@ -0,0 +1,1054 @@
+ from collections.abc import Callable
+ import asyncio
+ import random
+ import backoff
+ import httpx
+ from typing import List, Optional, Dict, Any
+ from tqdm.asyncio import tqdm
+ from fuzzywuzzy import fuzz
+ import pandas as pd
+ from bs4 import BeautifulSoup
+ from duckduckgo_search.exceptions import RatelimitException
+ from ...exceptions import ComponentError, ConfigError
+ from ...interfaces import HTTPService, SeleniumService
+ from ...interfaces.http import ua, bad_gateway_exception
+ from ..flow import FlowComponent
+ from .parsers import (
+     LeadiqScrapper,
+     ExploriumScrapper,
+     ZoomInfoScrapper,
+     SicCodeScrapper,
+     RocketReachScrapper,
+     VisualVisitorScrapper
+ )
+ import json
+ import re
+
+
+ class CompanyScraper(FlowComponent, SeleniumService, HTTPService):
+     """
+     Company Scraper Component
+
+     Overview:
+
+     This component scrapes company information from different sources using HTTPService.
+     It can receive URLs from a previous component (like GoogleSearch) and extract
+     specific company information.
+
+     .. table:: Properties
+        :widths: auto
+
+     +-----------------------+----------+------------------------------------------------------------------------------------------------------+
+     | Name | Required | Description |
+     +-----------------------+----------+------------------------------------------------------------------------------------------------------+
+     | url_column (str) | Yes | Name of the column containing URLs to scrape (default: 'search_url') |
+     +-----------------------+----------+------------------------------------------------------------------------------------------------------+
+     | wait_for (tuple) | No | Element to wait for before scraping (default: ('class', 'company-overview')) |
+     +-----------------------+----------+------------------------------------------------------------------------------------------------------+
+
+     Return:
+
+     The component adds new columns to the DataFrame with company information:
+     - headquarters
+     - phone_number
+     - website
+     - stock_symbol
+     - naics_code
+     - employee_count
+     """  # noqa: E501
+
+     def __init__(
+         self,
+         loop: asyncio.AbstractEventLoop = None,
+         job: Callable = None,
+         stat: Callable = None,
+         **kwargs,
+     ) -> None:
+         self.info_column: str = kwargs.get('column_name', 'company_name')
+         self.scrappers: list = kwargs.get('scrappers', ['leadiq'])
+         self.wait_for: tuple = kwargs.get('wait_for', ('class', 'company-overview'))
+         self._counter: int = 0
+         self.use_proxy: bool = True
+         self._free_proxy: bool = False
+         self.paid_proxy: bool = True
+         self.chunk_size: int = kwargs.get('chunk_size', 100)
+         self.concurrently: bool = kwargs.get('concurrently', True)
+         self.task_parts: int = kwargs.get('task_parts', 10)
+         super().__init__(loop=loop, job=job, stat=stat, **kwargs)
+         # Headers configuration
+         self.headers: dict = {
+             "Accept": self.accept,
+             "TE": "trailers",
+             "Accept-Encoding": "gzip, deflate",
+             "DNT": "1",
+             "Connection": "keep-alive",
+             "Upgrade-Insecure-Requests": "1",
+             "User-Agent": random.choice(ua),
+             **kwargs.get('headers', {})
+         }
+         self._free_proxy = False
+
+     def split_parts(self, task_list, num_parts: int = 5) -> list:
+         """Split task list into parts for concurrent processing."""
+         part_size = len(task_list) // num_parts
+         remainder = len(task_list) % num_parts
+         parts = []
+         start = 0
+         for i in range(num_parts):
+             # Distribute the remainder across the first `remainder` parts
+             end = start + part_size + (1 if i < remainder else 0)
+             parts.append(task_list[start:end])
+             start = end
+         return parts
+
+     async def _processing_tasks(self, tasks: list) -> pd.DataFrame:
+         """Process tasks concurrently and format the results."""
+         results = []
+         total_tasks = len(tasks)
+         with tqdm(total=total_tasks, desc="Scraping Progress", unit="task") as pbar_total:  # Overall progress bar
+             if self.concurrently is False:
+                 # run every task in a sequential manner:
+                 for task in tasks:
+                     try:
+                         idx, row = await task
+                         results.append((idx, row))  # Append as tuple (idx, row)
+                         await asyncio.sleep(
+                             random.uniform(0.5, 2)
+                         )
+                     except Exception as e:
+                         self._logger.error(f"Task error: {str(e)}")
+                         idx, row = self._get_error_info(e)  # Handle error
+                         results.append((idx, row))  # Store the failure result
+                     finally:
+                         pbar_total.update(1)
+             else:
+                 # run all tasks concurrently
+                 for chunk in self.split_parts(tasks, self.task_parts):
+                     chunk_size = len(chunk)
+                     # Use return_exceptions=True to capture errors without stopping execution
+                     chunk_results = await asyncio.gather(
+                         *chunk, return_exceptions=True
+                     )
+                     for result in chunk_results:
+                         if isinstance(result, Exception):
+                             self._logger.error(f"Task error: {str(result)}")
+                             idx, row = self._get_error_info(result)  # Extract idx, row from error
+                             results.append((idx, row))
+                         else:
+                             results.append(result)
+
+                     pbar_total.update(chunk_size)
+
+         # Convert results to DataFrame
+         if not results:
+             return pd.DataFrame()
+
+         indices, data_dicts = zip(*results) if results else ([], [])
+         df = pd.DataFrame(data_dicts, index=indices)
+
+         # Ensure all expected columns exist
+         expected_columns = [
+             'company_name',
+             'logo_url',
+             'address',
+             'phone_number',
+             'website',
+             'stock_symbol',
+             'naics_code',
+             'sic_code',
+             'employee_count',
+             'revenue_range',
+             'similar_companies',
+             'search_term',
+             'search_url'
+         ]
+
+         for col in expected_columns:
+             if col not in df.columns:
+                 df[col] = None
+
+         return df
+
+     async def start(self, **kwargs) -> bool:
+         """Initialize the component and validate required parameters."""
+         if self.previous:
+             self.data = self.input
+
+         if not isinstance(self.data, pd.DataFrame):
+             raise ComponentError(
+                 "Input must be a DataFrame", status=404
+             )
+
+         if self.info_column not in self.data.columns:
+             raise ConfigError(
+                 f"Column {self.info_column} not found in DataFrame"
+             )
+
+         # Initialize result columns
+         new_columns = [
+             'search_term',
+             'search_url',
+             'company_name',
+             'logo_url',
+             'address',
+             'phone_number',
+             'website',
+             'stock_symbol',
+             'naics_code',
+             'sic_code',
+             'employee_count',
+             'revenue_range',
+             'similar_companies',
+             'industry_category',
+             'industry',
+             'category',
+             'company_description',
+             'city',
+             'state',
+             'zip_code',
+             'country',
+             'metro_area',
+             'headquarters',
+             'location',
+             'number_employees',
+             'founded',
+             'search_status',
+             'scrape_status'
+         ]
+         for col in new_columns:
+             if col not in self.data.columns:
+                 self.data[col] = None
+
+         return True
+
+     def extract_company_info(self, soup: BeautifulSoup, search_term: str, search_url: str) -> Dict[str, Any]:
+         """Extract company information from the page."""
+         result = {}
+         result['search_term'] = search_term
+         result['search_url'] = search_url
+         # Get company name and logo URL from logo image
+         logo = soup.find('img', {'alt': True, 'width': '76.747'})
+         if logo:
+             result['company_name'] = logo.get('alt')
+             result['logo_url'] = logo.get('src')
+
+         # Get company revenue range from highlight-right section
+         highlight_right = soup.find('div', {'class': 'highlight-right'})
+         if highlight_right:
+             revenue_span = highlight_right.find('span', {'class': 'start'})
+             if revenue_span:
+                 start_value = revenue_span.text.strip()
+                 end_span = revenue_span.find_next_sibling('span', {'class': 'end'})
+                 if end_span:
+                     end_value = end_span.text.strip()
+                     result['revenue_range'] = f"{start_value} - {end_value}"
+                 else:
+                     result['revenue_range'] = start_value
+
+         # First find the highlight-left section that contains company info
+         highlight_left = soup.find('div', {'class': 'highlight-left'})
+         if not highlight_left:
+             self._logger.warning("Could not find highlight-left section")
+             return result
+
+         # Then find the card span within highlight-left
+         overview_section = highlight_left.find('div', {'class': 'card span'})
+         if not overview_section:
+             return result
+
+         # Extract information from dl/dt/dd elements
+         dl_element = overview_section.find('dl')
+         if dl_element:
+             for item in dl_element.find_all('div', {'class': 'item'}):
+                 dt = item.find('dt')
+                 dd = item.find('dd')
+                 if dt and dd:
+                     field = dt.text.strip().lower()
+                     value = dd.text.strip()
+
+                     # Map fields to our column names
+                     if field == 'headquarters':
+                         result['address'] = value
+                     elif field == 'phone number':
+                         phone = value.replace('****', '0000')
+                         result['phone_number'] = phone
+                     elif field == 'website':
+                         website = dd.find('a')
+                         result['website'] = website['href'] if website else value
+                     elif field == 'stock symbol':
+                         result['stock_symbol'] = value
+                     elif field == 'naics code':
+                         result['naics_code'] = value
+                     elif field == 'employees':
+                         result['employee_count'] = value
+                     elif field == 'sic code':
+                         result['sic_code'] = value
+
+         # Extract similar companies
+         similar_companies = []
+         similar_section = soup.find('div', {'id': 'similar'})
+         if similar_section:
+             for company in similar_section.find_all('li'):
+                 company_link = company.find('a')
+                 if not company_link:
+                     continue
+
+                 company_logo = company_link.find('img')
+                 company_name = company_link.find('h3')
+
+                 # Find revenue span
+                 revenue_spans = company_link.find_all('span')
+                 revenue_span = None
+                 for span in revenue_spans:
+                     if span.find('span', {'class': 'start'}):
+                         revenue_span = span
+                         break
+
+                 if company_name:
+                     similar_company = {
+                         'name': company_name.text.strip(),  # Do not escape the quotes
+                         'leadiq_url': company_link['href'],
+                         'logo_url': company_logo['src'] if company_logo else None,
+                     }
+
+                     # Extract revenue range
+                     if revenue_span:
+                         start = revenue_span.find('span', {'class': 'start'})
+                         end = revenue_span.find('span', {'class': 'end'})
+
+                         if start:
+                             start_value = start.text.strip()
+                             if end:
+                                 end_value = end.text.strip()
+                                 similar_company['revenue_range'] = f"{start_value} - {end_value}"
+                             else:
+                                 similar_company['revenue_range'] = start_value
+
+                     similar_companies.append(similar_company)
+
+         if similar_companies:
+             try:
+                 # Convert to a JSON string with the right options for PostgreSQL
+                 result['similar_companies'] = json.dumps(
+                     similar_companies,
+                     ensure_ascii=False,  # Allow Unicode characters
+                     allow_nan=False,  # Disallow NaN/Infinity
+                     separators=(',', ':')  # Use compact format
+                 )
+             except Exception as e:
+                 self._logger.error(f"Error formatting similar companies JSON: {str(e)}")
+                 result['similar_companies'] = None
+
+         if not result:
+             self._logger.warning("No data was extracted from the page")
+         else:
+             self._logger.info("Successfully extracted data")
+
+         return result
+
+     @backoff.on_exception(
+         backoff.expo,
+         (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError),
+         max_tries=3,
+         max_time=60,
+         giveup=lambda e: not bad_gateway_exception(e) and not isinstance(e, (httpx.ConnectTimeout, httpx.ReadTimeout))
+     )
+     async def scrape_url(self, idx: int, url: str) -> tuple[int, Optional[Dict[str, Any]]]:
+         """Scrape company information from URL."""
+         if not url:
+             return idx, None
+
+         try:
+             # Determine which kind of URL this is
+             if 'leadiq.com' in url:
+                 return await self._scrape_leadiq(idx, url)
+             elif 'explorium.ai' in url:
+                 return await self._scrape_explorium(idx, url)
+             else:
+                 self._logger.warning(f"Unsupported URL domain: {url}")
+                 return idx, None
+
+         except Exception as e:
+             self._logger.error(f"Error scraping {url}: {str(e)}")
+             return idx, None
+
+     def _parse_address(self, address: str) -> Dict[str, str]:
+         """Parse address string to extract state, zipcode and country."""
+         if not address:
+             return {
+                 'address': None,
+                 'state': None,
+                 'zipcode': None,
+                 'country': None
+             }
+
+         # Keep the original address
+         result = {'address': address}
+
+         # First regex, for the full format
+         pattern1 = r'^.*,\s+([^,]+?)\s+([\w\s-]+)\s+([A-Z]{2})$'
+         # Second regex, as a fallback
+         pattern2 = r'^.*,\s*([^,]+?),\s+([\w\s-]+?)\s*([A-Z]{2})'
+
+         try:
+             # Try the first regex
+             match = re.search(pattern1, address)
+             if not match:
+                 # If there is no match, try the second one
+                 match = re.search(pattern2, address)
+
+             if match:
+                 result['state'] = match.group(1).strip()
+                 result['zipcode'] = match.group(2).strip()
+                 result['country'] = match.group(3).strip()
+             else:
+                 self._logger.warning(f"Could not parse address: {address}")
+                 result.update({
+                     'state': None,
+                     'zipcode': None,
+                     'country': None
+                 })
+         except Exception as e:
+             self._logger.error(f"Error parsing address {address}: {str(e)}")
+             result.update({
+                 'state': None,
+                 'zipcode': None,
+                 'country': None
+             })
+
+         return result
+
+     @backoff.on_exception(
+         backoff.expo,
+         (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError),
+         max_tries=3,
+         max_time=60,
+         giveup=lambda e: not bad_gateway_exception(e) and not isinstance(e, (httpx.ConnectTimeout, httpx.ReadTimeout))
+     )
+     async def _scrape_explorium(self, idx: int, url: str) -> tuple[int, Optional[Dict[str, Any]]]:
+         """Scrape company information from Explorium.ai."""
+         # Initialize the result with default values
+         result = {
+             'search_term': self.data.iloc[idx].get('search_term', ''),
+             'search_url': url,
+             'source_platform': 'explorium',
+             'company_name': None,
+             'logo_url': None,
+             'address': None,
+             'state': None,
+             'zipcode': None,
+             'country': None,
+             'phone_number': None,
+             'website': None,
+             'stock_symbol': None,
+             'naics_code': None,
+             'sic_code': None,
+             'employee_count': None,
+             'revenue_range': None,
+             'similar_companies': None,
+             'scrape_status': 'pending'
+         }
+
+         try:
+             self._logger.notice(f"Scraping Explorium URL: {url}")
+
+             self.headers["User-Agent"] = random.choice(ua)
+
+             # Use the HTTP client with a timeout
+             async with httpx.AsyncClient(timeout=30.0) as client:
+                 response = await self._get(url, headers=self.headers)
+
+                 if response.status_code != 200:
+                     self._logger.error(f"Failed to fetch URL {url}: {response.status_code}")
+                     return idx, None
+
+                 await asyncio.sleep(random.uniform(1, 3))
+
+                 content = response.text
+                 soup = BeautifulSoup(content, 'html.parser')
+
+                 # Extract the company name
+                 title = soup.find('h1')
+                 if title:
+                     result['company_name'] = title.text.strip()
+
+                 # Extract the logo if present
+                 logo = soup.find('img', {'class': 'company-logo'})  # Adjust selector to the actual HTML
+                 if logo:
+                     result['logo_url'] = logo.get('src')
+
+                 # Extract other details
+                 details = soup.find_all('div', {'class': 'company-detail'})
+                 for detail in details:
+                     label = detail.find('span', {'class': 'label'})
+                     value = detail.find('span', {'class': 'value'})
+                     if label and value:
+                         label_text = label.text.strip().lower()
+                         value_text = value.text.strip()
+
+                         # Map Explorium fields onto the LeadIQ structure
+                         if 'website' in label_text:
+                             result['website'] = value_text
+                         elif 'location' in label_text:
+                             address_info = self._parse_address(value_text)
+                             result.update(address_info)
+                         elif 'size' in label_text or 'employees' in label_text:
+                             result['employee_count'] = value_text
+                         elif 'revenue' in label_text:
+                             result['revenue_range'] = value_text
+                         elif 'naics' in label_text:
+                             result['naics_code'] = value_text
+                         elif 'sic' in label_text:
+                             result['sic_code'] = value_text
+                         elif 'phone' in label_text:
+                             result['phone_number'] = value_text
+                         elif 'stock' in label_text:
+                             result['stock_symbol'] = value_text
+
+                 # Extract similar companies if present
+                 similar_section = soup.find('div', {'class': 'similar-companies'})  # Adjust selector
+                 if similar_section:
+                     similar_companies = []
+                     for company in similar_section.find_all('div', {'class': 'company-card'}):  # Adjust selector
+                         company_name = company.find('h3')
+                         if company_name:
+                             similar_company = {
+                                 'name': company_name.text.strip(),
+                                 'explorium_url': company.find('a')['href'] if company.find('a') else None,
+                                 'logo_url': company.find('img')['src'] if company.find('img') else None,
+                             }
+                             similar_companies.append(similar_company)
+
+                     if similar_companies:
+                         try:
+                             result['similar_companies'] = json.dumps(
+                                 similar_companies,
+                                 ensure_ascii=False,
+                                 allow_nan=False,
+                                 separators=(',', ':')
+                             )
+                         except Exception as e:
+                             self._logger.error(f"Error formatting similar companies JSON: {str(e)}")
+
+                 if result:
+                     self._counter += 1
+                     result['scrape_status'] = 'success'
+                     return idx, result
+
+                 return idx, None
+
+         except httpx.TimeoutException as e:
+             self._logger.error(f"Timeout scraping Explorium URL {url}: {str(e)}")
+             result['scrape_status'] = 'timeout'
+             return idx, result
+         except Exception as e:
+             self._logger.error(f"Error scraping Explorium URL {url}: {str(e)}")
+             result['scrape_status'] = f'error: {str(e)[:50]}'
+             return idx, result
+
+     @backoff.on_exception(
+         backoff.expo,
+         (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.HTTPStatusError),
+         max_tries=3,
+         max_time=60,
+         giveup=lambda e: not bad_gateway_exception(e) and not isinstance(e, (httpx.ConnectTimeout, httpx.ReadTimeout))
+     )
+     async def _scrape_leadiq(self, idx: int, url: str) -> tuple[int, Optional[Dict[str, Any]]]:
+         """Scrape company information from LeadIQ."""
+         # Initialize the result with default values
+         result = {
+             'search_term': self.data.iloc[idx].get('search_term', ''),
+             'search_url': url,
+             'source_platform': 'leadiq',
+             'company_name': None,
+             'logo_url': None,
+             'address': None,
+             'state': None,
+             'zipcode': None,
+             'country': None,
+             'phone_number': None,
+             'website': None,
+             'stock_symbol': None,
+             'naics_code': None,
+             'sic_code': None,
+             'employee_count': None,
+             'revenue_range': None,
+             'similar_companies': None,
+             'scrape_status': 'pending'
+         }
+
+         try:
+             self._logger.notice(f"Scraping LeadIQ URL: {url}")
+
+             self.headers["User-Agent"] = random.choice(ua)
+
+             # Use the HTTP client with a timeout
+             async with httpx.AsyncClient(timeout=30.0) as client:
+                 response = await self._get(url, headers=self.headers)
+
+                 if response.status_code != 200:
+                     self._logger.error(f"Failed to fetch URL {url}: {response.status_code}")
+                     return idx, None
+
+                 await asyncio.sleep(random.uniform(1, 3))
+
+                 content = response.text
+                 soup = BeautifulSoup(content, 'html.parser')
+
+                 # Get company name and logo URL from logo image
+                 logo = soup.find('img', {'alt': True, 'width': '76.747'})
+                 if logo:
+                     result['company_name'] = logo.get('alt')
+                     result['logo_url'] = logo.get('src')
+
+                 # Get company revenue range from highlight-right section
+                 highlight_right = soup.find('div', {'class': 'highlight-right'})
+                 if highlight_right:
+                     revenue_span = highlight_right.find('span', {'class': 'start'})
+                     if revenue_span:
+                         start_value = revenue_span.text.strip()
+                         end_span = revenue_span.find_next_sibling('span', {'class': 'end'})
+                         if end_span:
+                             end_value = end_span.text.strip()
+                             result['revenue_range'] = f"{start_value} - {end_value}"
+                         else:
+                             result['revenue_range'] = start_value
+
+                 # First find the highlight-left section that contains company info
+                 highlight_left = soup.find('div', {'class': 'highlight-left'})
+                 if not highlight_left:
+                     self._logger.warning("Could not find highlight-left section")
+                     return idx, result
+
+                 # Then find the card span within highlight-left
+                 overview_section = highlight_left.find('div', {'class': 'card span'})
+                 if not overview_section:
+                     return idx, result
+
+                 # Extract information from dl/dt/dd elements
+                 dl_element = overview_section.find('dl')
+                 if dl_element:
+                     for item in dl_element.find_all('div', {'class': 'item'}):
+                         dt = item.find('dt')
+                         dd = item.find('dd')
+                         if dt and dd:
+                             field = dt.text.strip().lower()
+                             value = dd.text.strip()
+
+                             # Map fields to our column names
+                             if field == 'headquarters':
+                                 address_info = self._parse_address(value)
+                                 result.update(address_info)
+                             elif field == 'phone number':
+                                 phone = value.replace('****', '0000')
+                                 result['phone_number'] = phone
+                             elif field == 'website':
+                                 website = dd.find('a')
+                                 result['website'] = website['href'] if website else value
+                             elif field == 'stock symbol':
+                                 result['stock_symbol'] = value
+                             elif field == 'naics code':
+                                 result['naics_code'] = value
+                             elif field == 'employees':
+                                 result['employee_count'] = value
+                             elif field == 'sic code':
+                                 result['sic_code'] = value
+
+                 # Extract similar companies
+                 similar_companies = []
+                 similar_section = soup.find('div', {'id': 'similar'})
+                 if similar_section:
+                     for company in similar_section.find_all('li'):
+                         company_link = company.find('a')
+                         if not company_link:
+                             continue
+
+                         company_logo = company_link.find('img')
+                         company_name = company_link.find('h3')
+
+                         # Find revenue span
+                         revenue_spans = company_link.find_all('span')
+                         revenue_span = None
+                         for span in revenue_spans:
+                             if span.find('span', {'class': 'start'}):
+                                 revenue_span = span
+                                 break
+
+                         if company_name:
+                             similar_company = {
+                                 'name': company_name.text.strip(),  # Do not escape the quotes
+                                 'leadiq_url': company_link['href'],
+                                 'logo_url': company_logo['src'] if company_logo else None,
+                             }
+
+                             # Extract revenue range
+                             if revenue_span:
+                                 start = revenue_span.find('span', {'class': 'start'})
+                                 end = revenue_span.find('span', {'class': 'end'})
+
+                                 if start:
+                                     start_value = start.text.strip()
+                                     if end:
+                                         end_value = end.text.strip()
+                                         similar_company['revenue_range'] = f"{start_value} - {end_value}"
+                                     else:
+                                         similar_company['revenue_range'] = start_value
+
+                             similar_companies.append(similar_company)
+
+                 if similar_companies:
+                     try:
+                         # Convert to a JSON string with the right options for PostgreSQL
+                         result['similar_companies'] = json.dumps(
+                             similar_companies,
+                             ensure_ascii=False,  # Allow Unicode characters
+                             allow_nan=False,  # Disallow NaN/Infinity
+                             separators=(',', ':')  # Use compact format
+                         )
+                     except Exception as e:
+                         self._logger.error(f"Error formatting similar companies JSON: {str(e)}")
+                         result['similar_companies'] = None
+
+                 if result:
+                     self._counter += 1
+                     result['scrape_status'] = 'success'
+                     return idx, result
+
+                 return idx, None
+
+         except httpx.TimeoutException as e:
+             self._logger.error(f"Timeout scraping LeadIQ URL {url}: {str(e)}")
+             result['scrape_status'] = 'timeout'
+             return idx, result
+         except Exception as e:
+             self._logger.error(f"Error scraping LeadIQ URL {url}: {str(e)}")
+             result['scrape_status'] = f'error: {str(e)[:50]}'
+             return idx, result
+
+     def _check_company_name(self, company_name: str, title: str, scrapper: Any):
+         # Extract the Company Name from the title provided
+         pattern = r'\b(' + '|'.join(re.escape(kw) for kw in scrapper.keywords) + r')\b'
+         # Search for the first occurrence of any keyword
+         match = re.search(pattern, title, re.IGNORECASE)
+         if not match:
+             return False
+
+         result = title[:match.start()].strip()
+         if not result:  # If result is empty
+             return False
+
+         company = company_name.strip()
+         # print('Company Name: ', company_name)
+         # print("COMPANY > ", result)
+         if company.lower() == result.lower():
+             return True
+
+         # second way, normalize names reducing to one element each:
+         cp = result.split()[0]
+         cp2 = company.split()[0]
+         if cp.lower() == cp2.lower():
+             return True
+
+         # Check with Fuzzy Search if Company matches.
+         score = fuzz.ratio(company.lower(), result.lower())
+         if score > 85:
+             return True
+
+         return False
+
+     def _standardize_name(self, text: str) -> str:
+         """Standardize the text format: lowercase, with hyphens instead of spaces."""
+         # First clean up special characters and extra spaces
+         cleaned = text.strip().lower().replace(' ', '-')
+         return f"'{cleaned}'"
+
+     async def search_in_ddg(
+         self,
+         search_term: str,
+         company_name: str,
+         scrapper: Any,
+         backend: str = 'html',
+         region: str = 'wt-wt'
+     ):
+         """Search for a term in DuckDuckGo."""
+         try:
+             results = await self._search_duckduckgo(
+                 search_term,
+                 use_proxy=True,
+                 headers=self.headers,
+                 max_results=10,
+                 backend=backend,
+                 region=region,
+             )
+             if not results:
+                 raise RuntimeError("Could not find any results")
+             if company := self._company_exists(results, company_name, scrapper):
+                 return company
+             else:
+                 raise RuntimeError(
+                     "Could not find a company matching the search term"
+                 )
+         except (RatelimitException, RuntimeError) as e:
+             self._logger.warning(f'Search Error: {e}')
+             raise RuntimeError('Search Error')
+
+     async def search_in_google(
+         self,
+         search_term,
+         company_name: str,
+         scrapper: Any,
+         use_selenium: bool = False
+     ):
+         # Try to find company on Google Search:
+         try:
+             if use_selenium:
+                 results = await self.search_google_cse(search_term, max_results=10)
+             else:
+                 try:
+                     response = await self._search_google(
+                         search_term,
+                         use_proxy=True,
+                         headers=self.headers,
+                         max_results=10,
+                         region='us',
+                         language='lang_en',
+                         country='countryUS'
+                     )
+                     results = response.get('items', [])
+                 except (httpx.ConnectError, httpx.RemoteProtocolError, httpx.WriteTimeout) as e:
+                     self._logger.warning(
+                         f"Connection error with Google API: {str(e)}, trying with Selenium..."
+                     )
+                     try:
+                         results = await self.search_google_cse(search_term, max_results=10)
+                     except (RuntimeError, ComponentError):
+                         raise RuntimeError("Could not find any results")
+             if company := self._company_exists(results, company_name, scrapper):
+                 return company
+             else:
+                 raise RuntimeError(
+                     "Could not find a company matching the search term"
+                 )
+         except RuntimeError as e:
+             if str(e) == "No results found":
+                 self._logger.warning(f"No results found for search term: {search_term}")
+             raise RuntimeError(
+                 "Could not find a company matching the search term"
+             )
+
+     def _company_exists(self, results: list, company: str, scrapper: Any):
+         # Check if the Company Name is present in the title of the search results.
+         for r in results:
+             title = r.get('title', None)
+             # print('TITLE > ', title)
+             if not title:
+                 continue
+             if any(keyword in title for keyword in scrapper.keywords):
+                 # print('KEYword > ', title)
+                 if self._check_company_name(company, title, scrapper):
+                     self._logger.debug(f"Company Found: {company}")
+                     return r
+         return None
+
+     async def _search_company(self, idx, row, cookies):
+         try:
+             async with self._semaphore:
+                 # Extract the Company Name:
+                 company_name = row[self.info_column]
+                 # Let's mark this company as not found.
+                 row['search_status'] = 'Not Found'
+                 # Wait a random amount of time between 1 and 2 seconds to avoid
+                 # DuckDuckGo rate limiting.
+                 await asyncio.sleep(
+                     random.uniform(1, 2)
+                 )
+                 # First step, search for Company in DuckDuckGo or fallback in Google (GSE):
+                 for scrapper in self.scrappers:
+                     search_term = scrapper.define_search_term(company_name)
+                     ## search_term = scrapper.search_term.format(standardized_term)
+                     scrapper.search_term_used = search_term
+                     self._logger.notice(f"Searching for: {search_term}")
+
+                     try:
+                         company = await self.search_in_ddg(
+                             search_term, company_name, scrapper
+                         )
+                     except RuntimeError as e:
+                         self._logger.warning(f'Search Error: {e}')
+                         try:
+                             company = await self.search_in_google(
+                                 search_term, company_name, scrapper
+                             )
+                         except RuntimeError:
+                             try:
+                                 company = await self.search_in_google(
+                                     search_term,
+                                     company_name,
+                                     scrapper,
+                                     use_selenium=True
+                                 )
+                             except Exception as e:
+                                 self._logger.error(f"Search failed: {str(e)}")
+                                 row['search_status'] = f'Failed: {str(e)}'
+                                 continue
+                     if not company:
+                         continue
+
+                     # Second, extract URL from search results:
+                     url = company.get('link', None)
+                     if not url:
+                         url = company.get('href', company.get('url', None))
+                     if not url:
+                         row['search_status'] = 'URL not found'
+                         continue
+
+                     # Strip unwanted suffixes from the URL
+                     if '/employee-directory' in url:
+                         url = url.replace('/employee-directory', '')
+                     elif '/email-format' in url:
+                         url = url.replace('/email-format', '')
+
+                     try:
+                         row['search_url'] = url
+                         company_page = await scrapper.get(url, headers=self.headers)
+                         if not company_page:
+                             continue
+                     except (httpx.WriteTimeout, httpx.ConnectError, httpx.RemoteProtocolError, httpx.HTTPError) as e:
+                         self._logger.warning(f"HTTP error accessing {url}: {str(e)}")
+                         # Try Selenium as a fallback
+                         try:
+                             driver = await self.get_driver()
+                             await asyncio.sleep(2)  # Give the page time to load
+                             driver.get(url)
+                             company_page_text = driver.page_source
+                             company = BeautifulSoup(company_page_text, 'html.parser')
+                             _, scraped_data = await scrapper.scrapping(company, idx, row)
+                             if scraped_data is not None and scraped_data['scrape_status'] == 'success':
+                                 row.update(scraped_data)
+                                 row['search_status'] = f'Found in {scrapper.domain}'
+                                 return idx, row
+                         except Exception as se:
+                             self._logger.error(f"Selenium fallback failed: {str(se)}")
+                             continue
+                         finally:
+                             self.close_driver()
+                         continue
+
+                     # Third, scrape company information from content:
+                     company = BeautifulSoup(company_page.text, 'html.parser')
+                     scraped_idx, scraped_data = await scrapper.scrapping(company, idx, row)
+                     if scraped_data is not None and scraped_data['scrape_status'] == 'success':
+                         await asyncio.sleep(1.5)
+                         row.update(scraped_data)
+                         row['search_status'] = f'Found in {scrapper.domain}'
+                         return idx, row
+                 # No scrapper returned usable data for this company:
+                 row['search_status'] = 'Not Found on any website'
+                 return idx, row
+         except Exception as e:
+             # Mark the row as failed and preserve its information
+             row['search_status'] = f'Failed: {str(e)}'
+             # Create an exception that carries idx and row
+             error = RuntimeError(f"Search failed: {str(e)}")
+             error.idx = idx
+             error.row = row
+             raise error
+
+     async def run(self):
+         """Execute scraping for each URL in the DataFrame."""
+         httpx_cookies = self.get_httpx_cookies(
+             domain='leadiq.com', cookies=self.cookies
+         )
+         scrappers = []
+         for scrapper in self.scrappers:
+             if scrapper == 'leadiq':
+                 httpx_cookies = self.get_httpx_cookies(
+                     domain='.leadiq.com', cookies=self.cookies
+                 )
+                 scp = LeadiqScrapper(
+                     cookies=httpx_cookies
+                 )
+                 scrappers.append(
+                     scp
+                 )
+             if scrapper == 'explorium':
+                 httpx_cookies = self.get_httpx_cookies(
+                     domain='explorium.ai', cookies=self.cookies
+                 )
+                 scp = ExploriumScrapper(
+                     cookies=httpx_cookies
+                 )
+                 scrappers.append(
+                     scp
+                 )
+             if scrapper == 'zoominfo':
+                 httpx_cookies = self.get_httpx_cookies(
+                     domain='zoominfo.com', cookies=self.cookies
+                 )
+                 scp = ZoomInfoScrapper(
+                     cookies=httpx_cookies
+                 )
+                 scrappers.append(
+                     scp
+                 )
+             if scrapper == 'siccode':
+                 httpx_cookies = self.get_httpx_cookies(
+                     domain='siccode.com', cookies=self.cookies
+                 )
+                 scp = SicCodeScrapper(
+                     cookies=httpx_cookies
+                 )
+                 scrappers.append(
+                     scp
+                 )
+             if scrapper == 'rocketreach':
+                 httpx_cookies = self.get_httpx_cookies(
+                     domain='rocketreach.co', cookies=self.cookies
+                 )
+                 scp = RocketReachScrapper(
+                     cookies=httpx_cookies
+                 )
+                 scrappers.append(
+                     scp
+                 )
+             if scrapper == 'visualvisitor':
+                 httpx_cookies = self.get_httpx_cookies(
+                     domain='visualvisitor.com', cookies=self.cookies
+                 )
+                 scp = VisualVisitorScrapper(
+                     cookies=httpx_cookies
+                 )
+                 scrappers.append(
+                     scp
+                 )
+             # else:
+             #     self._logger.warning(
+             #         f"Unsupported scrapper: {scrapper}"
+             #     )
+         # return scrappers list to self.scrappers
+         self.scrappers = scrappers
+         if not scrappers:
+             raise ConfigError(
+                 "No valid scrappers were found or provided in configuration"
+             )
+         tasks = [
+             self._search_company(
+                 idx, row, httpx_cookies
+             ) for idx, row in self.data.iterrows()
+         ]
+         companies_info = await self._processing_tasks(tasks)
+         self._print_data_(companies_info, 'Company Search Results')
+
+         self._result = companies_info
+         return self._result
+
+     async def close(self):
+         """Clean up resources."""
+         return True
+
+     def _get_error_info(self, error):
+         """Extract idx and row from an error."""
+         if hasattr(error, 'idx') and hasattr(error, 'row'):
+             return error.idx, error.row
+         # If we cannot get the info, create a row with basic information
+         return None, {'search_status': f'Failed: {str(error)}'}
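
For orientation, the sketch below shows how the CompanyScraper component added in this release might be driven on its own. It is a minimal, hypothetical example, not part of the package: it assumes the class is importable as flowtask.components.CompanyScraper.CompanyScraper (consistent with the file layout above), that the FlowComponent base class supplies the logger, semaphore, HTTP helpers and the previous/input/accept attributes referenced in the code, and that the input DataFrame would normally come from a previous step such as GoogleSearch. The keyword names (column_name, scrappers, concurrently, task_parts) are the ones read in __init__ and run(); everything else is illustrative only.

# Hypothetical standalone driver, for illustration only; inside Flowtask the
# task runner normally builds the component and feeds it the previous step's data.
import asyncio
import pandas as pd

from flowtask.components.CompanyScraper import CompanyScraper  # assumed import path


async def main():
    scraper = CompanyScraper(
        column_name="company_name",         # column holding the names to search for
        scrappers=["leadiq", "explorium"],  # parsers registered in run()
        concurrently=True,
        task_parts=10,
    )
    # Attributes that the Flowtask runtime would usually provide:
    scraper.data = pd.DataFrame({"company_name": ["Acme Corporation"]})
    scraper.cookies = {}                    # passed to get_httpx_cookies() in run()
    await scraper.start()                   # validates the DataFrame and adds result columns
    df = await scraper.run()                # DataFrame with the scraped columns
    await scraper.close()
    print(df[["company_name", "website", "employee_count", "search_status"]])


if __name__ == "__main__":
    asyncio.run(main())

Whether the component can be constructed outside a task in exactly this way depends on the FlowComponent constructor; the sketch only illustrates the lifecycle (start, run, close) and the configuration keys visible in the code above.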