amz-extractor 0.9.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.1
2
+ Name: amz_extractor
3
+ Version: 0.9.9
4
+ Summary: 提取亚马逊详情页和评论信息
5
+ Author: __token__
6
+ License: UNKNOWN
7
+ Platform: UNKNOWN
8
+
9
+ UNKNOWN
10
+
@@ -0,0 +1,2 @@
1
+ from .listing_parser import ListingParser
2
+ from .review_parser import ReviewParser
@@ -0,0 +1,869 @@
1
+ import json
2
+ import re
3
+ from functools import lru_cache
4
+
5
+ import dateparser
6
+ from pyquery import PyQuery as pq
7
+
8
+ CATEGORY_CODES_BY_COUNTRY = [
9
+ {
10
+ "country": "us",
11
+ "categoryCodes": {
12
+ "all departments": "aps",
13
+ "audible books & originals": "audible",
14
+ "alexa skills": "alexa-skills",
15
+ "amazon devices": "amazon-devices",
16
+ "amazon fresh": "amazonfresh",
17
+ "amazon warehouse": "warehouse-deals",
18
+ "appliances": "appliances",
19
+ "apps & games": "mobile-apps",
20
+ "arts, crafts & sewing": "arts-crafts",
21
+ "automotive parts & accessories": "automotive",
22
+ "baby": "fashion-baby",
23
+ "beauty & personal care": "beauty",
24
+ "books": "stripbooks",
25
+ "cds & vinyl": "popular",
26
+ "cell phones & accessories": "mobile",
27
+ "clothing, shoes & jewelry": "fashion",
28
+ "women": "fashion-womens",
29
+ "men": "fashion-mens",
30
+ "girls": "fashion-girls",
31
+ "boys": "fashion-boys",
32
+ "collectibles & fine art": "collectibles",
33
+ "computers": "computers",
34
+ "courses": "courses",
35
+ "credit and payment cards": "financial",
36
+ "digital music": "digital-music",
37
+ "electronics": "electronics",
38
+ "garden & outdoor": "lawngarden",
39
+ "gift cards": "gift-cards",
40
+ "grocery & gourmet food": "grocery",
41
+ "handmade": "handmade",
42
+ "health, household & baby care": "hpc",
43
+ "home & business services": "local-services",
44
+ "home & kitchen": "garden",
45
+ "industrial & scientific": "industrial",
46
+ "just for prime": "prime-exclusive",
47
+ "kindle store": "digital-text",
48
+ "luggage & travel gear": "fashion-luggage",
49
+ "luxury beauty": "luxury-beauty",
50
+ "magazine subscriptions": "magazines",
51
+ "movies & tv": "movies-tv",
52
+ "musical instruments": "mi",
53
+ "office products": "office-products",
54
+ "pet supplies": "pets",
55
+ "prime pantry": "pantry",
56
+ "prime video": "instant-video",
57
+ "software": "software",
58
+ "sports & outdoors": "sporting",
59
+ "tools & home improvement": "tools",
60
+ "toys & games": "toys-and-games",
61
+ "vehicles": "vehicles",
62
+ "video games": "videogames"
63
+ }
64
+ },
65
+ {
66
+ "country": "us-intl",
67
+ "categoryCodes": {
68
+ "all departments": "aps",
69
+ "arts & crafts": "arts-crafts",
70
+ "automotive": "automotive",
71
+ "baby": "baby-products",
72
+ "beauty & personal care": "beauty",
73
+ "books": "stripbooks",
74
+ "computers": "computers",
75
+ "digital music": "digital-music",
76
+ "electronics": "electronics",
77
+ "kindle store": "digital-text",
78
+ "prime video": "instant-video",
79
+ "women's fashion": "fashion-womens",
80
+ "men's fashion": "fashion-mens",
81
+ "girls' fashion": "fashion-girls",
82
+ "boys' fashion": "fashion-boys",
83
+ "deals": "deals",
84
+ "health & household": "hpc",
85
+ "home & kitchen": "kitchen",
86
+ "industrial & scientific": "industrial",
87
+ "luggage": "luggage",
88
+ "movies & tv": "movies-tv",
89
+ "music, cds & vinyl": "music",
90
+ "pet supplies": "pets",
91
+ "software": "software",
92
+ "sports & outdoors": "sporting",
93
+ "tools & home improvement": "tools",
94
+ "toys & games": "toys-and-games",
95
+ "video games": "videogames"
96
+ }
97
+ },
98
+ {
99
+ "country": "ca",
100
+ "categoryCodes": {
101
+ "all departments": "aps",
102
+ "alexa skills": "alexa-skills",
103
+ "amazon devices": "amazon-devices",
104
+ "amazon warehouse deals": "warehouse-deals",
105
+ "apps & games": "mobile-apps",
106
+ "automotive": "automotive",
107
+ "baby": "baby",
108
+ "beauty": "beauty",
109
+ "books": "stripbooks",
110
+ "clothing & accessories": "apparel",
111
+ "electronics": "electronics",
112
+ "gift cards": "gift-cards",
113
+ "grocery": "grocery",
114
+ "handmade": "handmade",
115
+ "health & personal care": "hpc",
116
+ "home & kitchen": "kitchen",
117
+ "industrial & scientific": "industrial",
118
+ "jewelry": "jewelry",
119
+ "kindle store": "digital-text",
120
+ "livres en français": "french-books",
121
+ "luggage & bags": "luggage",
122
+ "luxury beauty": "luxury-beauty",
123
+ "movies & tv": "dvd",
124
+ "music": "popular",
125
+ "musical instruments, stage & studio": "mi",
126
+ "office products": "office-products",
127
+ "patio, lawn & garden": "lawngarden",
128
+ "pet supplies": "pets",
129
+ "shoes & handbags": "shoes",
130
+ "software": "software",
131
+ "sports & outdoors": "sporting",
132
+ "tools & home improvement": "tools",
133
+ "toys & games": "toys",
134
+ "video games": "videogames",
135
+ "watches": "watches"
136
+ }
137
+ },
138
+ {
139
+ "country": "uk",
140
+ "categoryCodes": {
141
+ "all departments": "aps",
142
+ "alexa skills": "alexa-skills",
143
+ "amazon devices": "amazon-devices",
144
+ "amazon fresh": "amazonfresh",
145
+ "amazon global store": "amazon-global-store",
146
+ "amazon pantry": "pantry",
147
+ "amazon warehouse": "warehouse-deals",
148
+ "apps & games": "mobile-apps",
149
+ "baby": "baby",
150
+ "beauty": "beauty",
151
+ "books": "stripbooks",
152
+ "car & motorbike": "automotive",
153
+ "cds & vinyl": "popular",
154
+ "classical music": "classical",
155
+ "clothing": "clothing",
156
+ "computers & accessories": "computers",
157
+ "digital music ": "digital-music",
158
+ "diy & tools": "diy",
159
+ "dvd & blu-ray": "dvd",
160
+ "electronics & photo": "electronics",
161
+ "fashion": "fashion",
162
+ "garden & outdoors": "outdoor",
163
+ "gift cards": "gift-cards",
164
+ "grocery": "grocery",
165
+ "handmade": "handmade",
166
+ "health & personal care": "drugstore",
167
+ "home & business services": "local-services",
168
+ "home & kitchen": "kitchen",
169
+ "industrial & scientific": "industrial",
170
+ "jewellery": "jewelry",
171
+ "kindle store": "digital-text",
172
+ "large appliances": "appliances",
173
+ "lighting": "lighting",
174
+ "luggage": "luggage",
175
+ "luxury beauty": "luxury-beauty",
176
+ "musical instruments & dj": "mi",
177
+ "pc & video games": "videogames",
178
+ "pet supplies": "pets",
179
+ "prime video": "instant-video",
180
+ "shoes & bags": "shoes",
181
+ "software": "software",
182
+ "sports & outdoors": "sports",
183
+ "stationery & office supplies": "office-products",
184
+ "toys & games": "toys",
185
+ "vhs": "vhs",
186
+ "watches": "watches"
187
+ }
188
+ },
189
+ {
190
+ "country": "fr",
191
+ "categoryCodes": {
192
+ "toutes nos catégories": "aps",
193
+ "alexa skills": "alexa-skills",
194
+ "amazon offres reconditionnées": "warehouse-deals",
195
+ "amazon pantry": "pantry",
196
+ "animalerie": "pets",
197
+ "appareils amazon": "amazon-devices",
198
+ "applis & jeux": "mobile-apps",
199
+ "auto et moto": "automotive",
200
+ "bagages": "luggage",
201
+ "beauté et parfum": "beauty",
202
+ "beauté prestige": "luxury-beauty",
203
+ "bijoux": "jewelry",
204
+ "boutique chèques-cadeaux": "gift-cards",
205
+ "boutique kindle": "digital-text",
206
+ "bricolage": "diy",
207
+ "bébés & puériculture": "baby",
208
+ "chaussures et sacs": "shoes",
209
+ "cuisine & maison": "kitchen",
210
+ "dvd & blu-ray": "dvd",
211
+ "epicerie": "grocery",
212
+ "fournitures de bureau": "office-products",
213
+ "gros électroménager": "appliances",
214
+ "handmade": "handmade",
215
+ "high-tech": "electronics",
216
+ "hygiène et santé": "hpc",
217
+ "informatique": "computers",
218
+ "instruments de musique & sono": "mi",
219
+ "jardin": "garden",
220
+ "jeux et jouets": "toys",
221
+ "jeux vidéo": "videogames",
222
+ "livres anglais et étrangers": "english-books",
223
+ "livres en français": "stripbooks",
224
+ "logiciels": "software",
225
+ "luminaires et eclairage": "lighting",
226
+ "mode": "fashion",
227
+ "montres": "watches",
228
+ "musique : cd & vinyles": "popular",
229
+ "musique classique": "classical",
230
+ "secteur industriel & scientifique": "industrial",
231
+ "sports et loisirs": "sports",
232
+ "téléchargement de musique": "digital-music",
233
+ "vêtements et accessoires": "clothing"
234
+ }
235
+ },
236
+ {
237
+ "country": "de",
238
+ "categoryCodes": {
239
+ "alle kategorien": "aps",
240
+ "alexa skills": "alexa-skills",
241
+ "amazon fresh": "amazonfresh",
242
+ "amazon geräte": "amazon-devices",
243
+ "amazon global store": "amazon-global-store",
244
+ "amazon pantry": "pantry",
245
+ "amazon warehouse": "warehouse-deals",
246
+ "apps & spiele": "mobile-apps",
247
+ "audible hörbücher": "audible",
248
+ "auto & motorrad": "automotive",
249
+ "baby": "baby",
250
+ "baumarkt": "diy",
251
+ "beauty": "beauty",
252
+ "bekleidung": "clothing",
253
+ "beleuchtung": "lighting",
254
+ "bücher": "stripbooks",
255
+ "bücher (fremdsprachig)": "english-books",
256
+ "bürobedarf & schreibwaren": "office-products",
257
+ "computer & zubehör": "computers",
258
+ "drogerie & körperpflege": "drugstore",
259
+ "dvd & blu-ray": "dvd",
260
+ "elektro-großgeräte": "appliances",
261
+ "elektronik & foto": "electronics",
262
+ "fashion": "fashion",
263
+ "games": "videogames",
264
+ "garten": "outdoor",
265
+ "geschenkgutscheine": "gift-cards",
266
+ "gewerbe, industrie & wissenschaft": "industrial",
267
+ "handmade": "handmade",
268
+ "haustier": "pets",
269
+ "kamera & foto": "photo",
270
+ "kindle-shop": "digital-text",
271
+ "klassik": "classical",
272
+ "koffer, rucksäcke & taschen ": "luggage",
273
+ "küche, haushalt & wohnen": "kitchen",
274
+ "lebensmittel & getränke": "grocery",
275
+ "luxury beauty": "luxury-beauty",
276
+ "musik-cds & vinyl": "popular",
277
+ "musik-downloads": "digital-music",
278
+ "musikinstrumente & dj-equipment": "mi",
279
+ "prime video": "instant-video",
280
+ "schmuck": "jewelry",
281
+ "schuhe & handtaschen": "shoes",
282
+ "software": "software",
283
+ "spielzeug": "toys",
284
+ "sport & freizeit": "sports",
285
+ "uhren": "watches",
286
+ "zeitschriften": "magazines"
287
+ }
288
+ },
289
+ {
290
+ "country": "es",
291
+ "categoryCodes": {
292
+ "todos los departamentos": "aps",
293
+ "alexa skills": "alexa-skills",
294
+ "alimentación y bebidas": "grocery",
295
+ "amazon pantry": "pantry",
296
+ "appstore para android": "mobile-apps",
297
+ "bebé": "baby",
298
+ "belleza": "beauty",
299
+ "bricolaje y herramientas": "diy",
300
+ "cheques regalo": "gift-cards",
301
+ "coche - renting": "vehicles",
302
+ "coche y moto - piezas y accesorios": "automotive",
303
+ "deportes y aire libre": "sporting",
304
+ "dispositivos de amazon": "amazon-devices",
305
+ "electrónica": "electronics",
306
+ "equipaje": "luggage",
307
+ "grandes electrodomésticos": "appliances",
308
+ "handmade": "handmade",
309
+ "hogar y cocina": "kitchen",
310
+ "iluminación": "lighting",
311
+ "industria y ciencia": "industrial",
312
+ "informática": "computers",
313
+ "instrumentos musicales": "mi",
314
+ "jardín": "lawngarden",
315
+ "joyería": "jewelry",
316
+ "juguetes y juegos": "toys",
317
+ "libros": "stripbooks",
318
+ "moda": "fashion",
319
+ "música digital": "digital-music",
320
+ "música: cds y vinilos": "popular",
321
+ "oficina y papelería": "office-products",
322
+ "películas y tv": "dvd",
323
+ "productos para mascotas": "pets",
324
+ "productos reacondicionados": "warehouse-deals",
325
+ "relojes": "watches",
326
+ "ropa y accesorios": "apparel",
327
+ "salud y cuidado personal": "hpc",
328
+ "software": "software",
329
+ "tienda kindle": "digital-text",
330
+ "videojuegos": "videogames",
331
+ "zapatos y complementos": "shoes"
332
+ }
333
+ },
334
+ {
335
+ "country": "it",
336
+ "categoryCodes": {
337
+ "tutte le categorie": "aps",
338
+ "abbigliamento": "apparel",
339
+ "alexa skill": "alexa-skills",
340
+ "alimentari e cura della casa": "grocery",
341
+ "amazon pantry": "pantry",
342
+ "amazon warehouse": "warehouse-deals",
343
+ "app e giochi": "mobile-apps",
344
+ "auto e moto": "automotive",
345
+ "bellezza": "beauty",
346
+ "buoni regalo": "gift-cards",
347
+ "cancelleria e prodotti per ufficio": "office-products",
348
+ "casa e cucina": "kitchen",
349
+ "cd e vinili ": "popular",
350
+ "dispositivi amazon": "amazon-devices",
351
+ "elettronica": "electronics",
352
+ "fai da te": "diy",
353
+ "film e tv": "dvd",
354
+ "giardino e giardinaggio": "garden",
355
+ "giochi e giocattoli": "toys",
356
+ "gioielli": "jewelry",
357
+ "grandi elettrodomestici": "appliances",
358
+ "handmade": "handmade",
359
+ "illuminazione": "lighting",
360
+ "industria e scienza": "industrial",
361
+ "informatica": "computers",
362
+ "kindle store": "digital-text",
363
+ "libri": "stripbooks",
364
+ "moda": "fashion",
365
+ "musica digitale": "digital-music",
366
+ "orologi": "watches",
367
+ "prima infanzia": "baby",
368
+ "prodotti per animali domestici": "pets",
369
+ "salute e cura della persona": "hpc",
370
+ "scarpe e borse": "shoes",
371
+ "software": "software",
372
+ "sport e tempo libero": "sporting",
373
+ "strumenti musicali e dj": "mi",
374
+ "valigeria": "luggage",
375
+ "videogiochi": "videogames"
376
+ }
377
+ },
378
+ {
379
+ "country": "mx",
380
+ "categoryCodes": {
381
+ "todos los departamentos": "aps",
382
+ "auto": "automotive",
383
+ "bebé": "baby",
384
+ "dispositivos de amazon": "amazon-devices",
385
+ "electrónicos": "electronics",
386
+ "películas y series de tv": "dvd",
387
+ "tienda kindle": "digital-text",
388
+ "ropa, zapatos y accesorios": "fashion",
389
+ "   mujeres": "fashion-womens",
390
+ "   hombres": "fashion-mens",
391
+ "   niñas": "fashion-girls",
392
+ "   niños": "fashion-boys",
393
+ "   bebé": "fashion-baby",
394
+ "alexa skills": "alexa-skills",
395
+ "alimentos y bebidas": "grocery",
396
+ "deportes y aire libre": "sporting",
397
+ "herramientas y mejoras del hogar": "hi",
398
+ "hogar y cocina": "kitchen",
399
+ "industria y ciencia": "industrial",
400
+ "instrumentos musicales": "mi",
401
+ "juegos y juguetes": "toys",
402
+ "libros": "stripbooks",
403
+ "mascotas": "pets",
404
+ "música": "popular",
405
+ "oficina y papelería": "office-products",
406
+ "productos handmade": "handmade",
407
+ "salud, belleza y cuidado personal": "hpc",
408
+ "software": "software",
409
+ "videojuegos": "videogames"
410
+ }
411
+ },
412
+ {
413
+ "country": "in",
414
+ "categoryCodes": {
415
+ "all categories": "aps",
416
+ "alexa skills": "alexa-skills",
417
+ "amazon devices": "amazon-devices",
418
+ "amazon fashion": "fashion",
419
+ "amazon fresh": "nowstore",
420
+ "amazon global store": "amazon-global-store",
421
+ "amazon pantry": "pantry",
422
+ "appliances": "appliances",
423
+ "apps & games": "mobile-apps",
424
+ "baby": "baby",
425
+ "beauty": "beauty",
426
+ "books": "stripbooks",
427
+ "car & motorbike": "automotive",
428
+ "clothing & accessories": "apparel",
429
+ "collectibles": "collectibles",
430
+ "computers & accessories": "computers",
431
+ "electronics": "electronics",
432
+ "furniture": "furniture",
433
+ "garden & outdoors": "lawngarden",
434
+ "gift cards": "gift-cards",
435
+ "grocery & gourmet foods": "grocery",
436
+ "health & personal care": "hpc",
437
+ "home & kitchen": "kitchen",
438
+ "industrial & scientific": "industrial",
439
+ "jewellery": "jewelry",
440
+ "kindle store": "digital-text",
441
+ "luggage & bags": "luggage",
442
+ "luxury beauty": "luxury-beauty",
443
+ "movies & tv shows": "dvd",
444
+ "music": "popular",
445
+ "musical instruments": "mi",
446
+ "office products": "office-products",
447
+ "pet supplies": "pets",
448
+ "prime video": "instant-video",
449
+ "shoes & handbags": "shoes",
450
+ "software": "software",
451
+ "sports, fitness & outdoors": "sporting",
452
+ "tools & home improvement": "home-improvement",
453
+ "toys & games": "toys",
454
+ "video games": "videogames",
455
+ "watches": "watches"
456
+ }
457
+ },
458
+ {
459
+ "country": "jp",
460
+ "categoryCodes": {
461
+ "すべてのカテゴリー": "aps",
462
+ "amazon デバイス": "amazon-devices",
463
+ "kindleストア ": "digital-text",
464
+ "prime video": "instant-video",
465
+ "alexaスキル": "alexa-skills",
466
+ "デジタルミュージック": "digital-music",
467
+ "android アプリ": "mobile-apps",
468
+ "本": "stripbooks",
469
+ "洋書": "english-books",
470
+ "ミュージック": "popular",
471
+ "クラシック": "classical",
472
+ "dvd": "dvd",
473
+ "tvゲーム": "videogames",
474
+ "pcソフト": "software",
475
+ "パソコン・周辺機器": "computers",
476
+ "家電&カメラ": "electronics",
477
+ "文房具・オフィス用品": "office-products",
478
+ "ホーム&キッチン": "kitchen",
479
+ "ペット用品": "pets",
480
+ "ドラッグストア": "hpc",
481
+ "ビューティー": "beauty",
482
+ "ラグジュアリービューティー": "luxury-beauty",
483
+ "食品・飲料・お酒": "food-beverage",
484
+ "ベビー&マタニティ": "baby",
485
+ "ファッション": "fashion",
486
+ "レディース": "fashion-womens",
487
+ "メンズ": "fashion-mens",
488
+ "キッズ&ベビー": "fashion-baby-kids",
489
+ "服&ファッション小物": "apparel",
490
+ "シューズ&バッグ": "shoes",
491
+ "腕時計": "watch",
492
+ "ジュエリー": "jewelry",
493
+ "おもちゃ": "toys",
494
+ "ホビー": "hobby",
495
+ "楽器": "mi",
496
+ "スポーツ&アウトドア": "sporting",
497
+ "車&バイク": "automotive",
498
+ "diy・工具・ガーデン": "diy",
499
+ "大型家電": "appliances",
500
+ "クレジットカード": "financial",
501
+ "ギフト券": "gift-cards",
502
+ "産業・研究開発用品": "industrial",
503
+ "amazonパントリー": "pantry",
504
+ "amazonアウトレット": "warehouse-deals",
505
+ "ホーム&キッチン": "kitchen",
506
+ "ベビー&マタニティ": "baby",
507
+ "スポーツ&アウトドア": "sporting"
508
+ }
509
+ }
510
+ ]
511
+
512
+
513
class ListingParser:
    """Extract structured fields from the HTML of an Amazon product detail page.

    The markup is parsed once with PyQuery in ``__init__``.  Each ``get_*``
    method reads one region of the DOM (or the raw HTML string) and returns
    plain Python values; ``get_all`` gathers every field into a single dict.
    """

    _SHIP_FLAGS = re.I | re.M

    # Buy-box wording that means Amazon itself sells and ships the item.
    _AMZ_SHIP_RE = re.compile(
        r'(?:(?:ships|dispatched)\s+from(?:\s|\S)+sold\s+by\s+Amazon)'
        r'|(?:sold\s+by:*\s+Amazon)'
        r'|(?:Expédié\s+et\s+vendu\s+par\s+Amazon)'
        r'|(?:Verkauf\s+und\s+Versand\s+durch\s+Amazon)'
        r'|(?:Vendido\s+y\s+enviado\s+por\s+Amazon)'
        r'|(?:Venduto\s+e\s+spedito\s+da\s+Amazon)'
        r'|(?:Amazon.co.jp\s+が販売、発送します。)'
        r'|(?:Amazon.co.jp がフラストレーション・フリー・パッケージで販売、発送します)'
        r'|(?:Envío\s+desde\s+Amazon\s+México)'
        r'|(?:販売元\s+Amazon.co.jp)',
        _SHIP_FLAGS,
    )

    # Wording that means a third party sells but Amazon fulfils (FBA).
    _FBA_SHIP_RE = re.compile(
        r'(?:fulfilled|ships\s+by|from\s+Amazon)'
        r'|(?:sold\s+by:)'
        r'|(?:expédié\s+par\s+amazon)'
        r'|(?:Versand\s+durch\s+Amazon)'
        r'|(?:enviado\s+por\s+Amazon)'
        r'|(?:gestionado\s+por\s+Amazon)'
        r'|(?:spedito\s+da\s+Amazon)'
        r'|(?:が販売し、Amazon.co.jp)'
        r'|(?:Envío\s+desde\s+Amazon)'
        r'|(?:出荷元\s+Amazon)',
        _SHIP_FLAGS,
    )

    # Wording that means the merchant sells and ships (FBM).  The Japanese
    # alternative uses a negative lookahead so a line mentioning
    # "Amazon.co.jp" before the verb is not counted as a merchant line.
    _MERCH_SHIP_RE = re.compile(
        r'(?:(?:ships|dispatched)\s+from\s+and\s+sold\s+by)'
        r'|(?:Sold\s+by)'
        r'|(?:Expédié\s+et\s+vendu\s+par)'
        r'|(?:Verkauf\s+und\s+Versand\s+durch)'
        r'|(?:Vendido\s+y\s+enviado\s+por)'
        r'|(?:Venduto\s+e\s+spedito\s+da)'
        r'|(?:^(?:(?!Amazon\.co\.jp).)*?\s+が販売、発送します。)'
        r'|(?:Envío\s+desde)'
        r'|(?:出荷元)'
        r'|(?:Spedizione)',
        _SHIP_FLAGS,
    )

    # Localised boilerplate that precedes the brand name in the byline.
    _BRAND_PREFIX_RE = re.compile(
        r'^(?:'
        r'(?:by|from|Visit the|Besuchen Sie den|Besuche den|Visita lo Store di'
        r'|Visita la Store de|Visiter la boutique|De)\s+'
        r'|(?:Brand|Marke|Marca|Marque|ブランド)\s*:\s*'
        r')'
    )

    def __init__(self, html):
        """Keep the raw ``html`` and parse it into a PyQuery document."""
        self.html = html
        self.d = pq(html)
        # Per-instance memo for get_detail_dict().
        self._detail_dict = None

    def get_title(self):
        """Return the text of ``#productTitle``."""
        return self.d('#productTitle').text()

    def get_asin(self):
        """Return the value of the ``#ASIN`` form field."""
        return self.d('#ASIN').val()

    @staticmethod
    def _clean_brand(brand_info):
        """Strip localised "Visit the ... Store" / "Brand:" wording from a byline.

        Boilerplate is removed only at the start and end of the text, so
        brand names that merely contain one of the tokens (e.g. "Dell",
        "Derby") are left intact.
        """
        text = brand_info.strip()
        text = ListingParser._BRAND_PREFIX_RE.sub('', text, count=1)
        # Suffixes are peeled once each, outermost first ("... Store openen").
        text = re.sub(r'\s+openen$', '', text)
        text = re.sub(r'(?:\s+|\s*-\s*)Store$', '', text)
        text = re.sub(r'のストアを表示$', '', text)
        return text.strip()

    def get_brand(self):
        """Return the brand name taken from the ``#bylineInfo`` element."""
        return self._clean_brand(self.d('#bylineInfo').text())

    @staticmethod
    def _classify_ships_from(merchant_info):
        """Map buy-box text to ``'AMZ'``, ``'FBA'``, ``'FBM'`` or ``'N.A.'``.

        The checks run in that priority order because the FBA and merchant
        patterns also match some Amazon-sold wording.
        """
        if ListingParser._AMZ_SHIP_RE.search(merchant_info):
            return 'AMZ'
        if ListingParser._FBA_SHIP_RE.search(merchant_info):
            return 'FBA'
        if ListingParser._MERCH_SHIP_RE.search(merchant_info):
            return 'FBM'
        return 'N.A.'

    def get_ships_from(self):
        """Classify who sells/ships the item from the buy-box text."""
        css_list = ['#tabular-buybox', '#merchant-info', '#usedbuyBox']
        return self._classify_ships_from(self.d(','.join(css_list)).text())

    def get_sold_by(self):
        """Return the seller name shown in the merchant info / buy box."""
        css_list = [
            '#merchant-info > a.a-link-normal:nth-of-type(1)',
            '#tabular-buybox a#sellerProfileTriggerId',
        ]
        return self.d(','.join(css_list)).text()

    def get_bullet_point(self):
        """Return the feature bullets joined with newlines.

        A single string (rather than a list) is returned to make storage
        easier.  Direction marks and non-breaking spaces become plain spaces.
        """
        items = self.d('#feature-bullets li:not([id]) span.a-list-item')
        # ``.text`` on the underlying element is None when the span starts
        # with a child element, so fall back to an empty string.
        return '\n'.join(
            re.sub(r'[\u200f\u200e\xa0]', ' ', node.text or '').strip()
            for node in items
        )

    def get_price(self):
        """Return the price text with all whitespace removed ('' if absent).

        Selectors are tried from most to least specific; the first one that
        yields text wins.
        """
        css_list = [
            '#apex_desktop_newAccordionRow .priceToPay .a-offscreen',
            '#apex_desktop > div > .priceToPay .a-offscreen',
            '#apex_desktop > div > div > .priceToPay .a-offscreen',
            '#apex_desktop > #corePrice_desktop .apexPriceToPay .a-offscreen',
            '#apex_desktop #apex_desktop_newAccordionRow > #corePrice_desktop .apexPriceToPay .a-offscreen',
            '#apex_desktop #apex_desktop_qualifiedBuybox #corePriceDisplay_desktop_feature_div span.priceToPay span[aria-hidden="true"]',
            '#apex_desktop #apex_desktop_newAccordionRow #corePriceDisplay_desktop_feature_div span.priceToPay span[aria-hidden="true"]',
            '#apex_desktop .priceToPay .a-offscreen',
            '.priceToPay',
        ]
        price = ''
        for css in css_list:
            price = self.d(css).text()
            if price:
                break
        return re.sub(r'\s', '', price)

    def get_rating(self):
        """Return the average star rating as a string such as ``'4.5'``."""
        title = self.d('#averageCustomerReviews_feature_div #acrPopover').attr('title')
        if not title:
            return ''
        first_token = title.split(' ')[0]
        return re.sub(r'5つ星のうち|\xa0étoile\(s\)', '', first_token).replace(',', '.')

    def get_rating_cnt(self):
        """Return the global ratings count with separators stripped."""
        text = self.d('#averageCustomerReviews_feature_div span#acrCustomerReviewText').text()
        return re.sub(r'[,.\s]|個の評価', '', text.split(' ')[0])

    def get_qa(self):
        """Return the Q&A count; this implementation always reports ``'0'``."""
        return '0'

    def get_monthly_sales(self):
        """Return the "bought in past month" figure as a digit string.

        Localised wording, ``+`` and whitespace are removed, then the
        ``mil`` / ``k`` / ``K`` suffixes are expanded to ``000``.
        """
        month_sales_text = self.d('#social-proofing-faceout-title-tk_bought > span').text()
        if not month_sales_text:
            return ''
        slim = re.sub(
            r'bought in past month|comprados el mes pasado|Mal im letzten Monat gekauft'
            r'|achetés au cours du mois dernier|acquistati nel mese scorso'
            r'|gekocht in de afgelopen maand|kupionych w ciągu ostatniego miesiąca'
            r'|köpta under den senaste månaden|satın alındı|過去1か月で|点以上購入されました'
            r'|Plus de|adetten fazla|Geçen ay |\+|\s',
            '', month_sales_text)
        return slim.replace('mil', '000').replace('k', '000').replace('K', '000').replace(' ', '')

    def _first_detail(self, labels):
        """Return the first non-empty detail value among ``labels`` ('' if none)."""
        detail_dict = self.get_detail_dict()
        for label in labels:
            value = detail_dict.get(label, '')
            if value:
                return value
        return ''

    def get_listing_date(self):
        """Return the first-available date as ``YYYY-MM-DD`` ('' if unknown).

        Labels whose value ``dateparser`` cannot parse are skipped instead of
        raising.
        """
        labels = (
            'Release date',
            'Date First Available',
            'Date first available',
            'Date first listed on Amazon',
            'Date de mise en ligne sur Amazon.fr',
            'Im Angebot von Amazon.de seit',
            'Disponibile su Amazon.it a partire dal',
            'Fecha de disponibilidad en Amazon',
            'Producto en Amazon.com.mx desde',
            'Producto en Amazon.es desde',
            'Amazon.co.jp での取り扱い開始日',
            'Datum eerste beschikbaarheid',
            'Date de mise en ligne sur Amazon.com.be',
        )
        detail_dict = self.get_detail_dict()
        for label in labels:
            raw = detail_dict.get(label, '')
            if not raw:
                continue
            parsed = dateparser.parse(raw)
            if parsed is not None:
                return str(parsed.date())
        return ''

    def get_variant(self):
        """Return the number of variants listed in ``colorToAsin`` (at least 1)."""
        match = re.search(r'"colorToAsin":(.*?),"refactorEnabled', self.html)
        if not match:
            return 1
        try:
            variant_dict = json.loads(match.group(1))
        except ValueError:
            # Truncated or otherwise malformed JSON: treat as a single variant.
            return 1
        return len(variant_dict) if variant_dict else 1

    def get_price_ped(self):
        """Return the Prime-exclusive price text with whitespace removed."""
        css = '#primeExclusivePricingMessage > a#pep-signup-link > span:nth-of-type(2)'
        return re.sub(r'\s', '', self.d(css).text())

    def is_promotion(self):
        """Return True when the applicable-promotions section is present."""
        return bool(self.d('#applicable_promotion_list_sec'))

    def get_coupon(self):
        """Return the coupon amount or percentage ('' if there is none)."""
        info = self.d('label[id^="couponText"]')
        if info:
            # Drop the noise and keep the first token that contains a number.
            tmp = re.sub(r'-|coupon|\xa0', '', info[0].text or '', flags=re.I).strip()
            for token in tmp.split():
                if re.search(r'\d+', token):
                    return token
        return ''

    def get_video_url(self):
        """Return the first mp4 URL found in the embedded video JSON ('' if none)."""
        video_info = re.findall(r'var obj = A\.\$\.parseJSON\(\'(.*?)\'\);', self.html)
        if not video_info:
            return ''
        url_info = re.findall(r'(https://m\.media-amazon\.com/images/S/.*?mp4)"', video_info[0])
        return url_info[0] if url_info else ''

    def get_main_img_url(self):
        """Return the ``src`` attribute of the main product image."""
        return self.d('#imgTagWrapperId img').attr('src')

    def get_color(self):
        """Return the colour shown in the product overview table."""
        return self.d('.a-spacing-small.po-color td:nth-of-type(2)').text()

    # Specification: size & weight
    def get_dimension(self):
        """Return the raw dimensions detail value ('' if not listed)."""
        return self._first_detail((
            'Package Dimensions', 'Product Dimensions', 'Dimensiones del producto',
            'Dimensiones del paquete', 'Parcel Dimensions', 'Produktabmessungen',
            'Verpackungsabmessungen', 'Dimensioni prodotto', 'Dimensioni del collo',
            "Dimensions de l'article L x L x H", 'Dimensions du produit (L x l x h)',
            '製品サイズ', '梱包サイズ', 'Productafmetingen',
        ))

    def get_size(self):
        """Return the size part of the dimensions.

        A ``size; weight`` value yields only the size; when no dimensions
        detail exists the product overview table is used instead.
        """
        dimension = self.get_dimension()
        if dimension == '':
            return self.d('.a-spacing-small.po-item_dimensions td:nth-of-type(2)').text()
        parts = dimension.split(';')
        if len(parts) == 2:
            return parts[0].strip()
        return dimension

    def get_weight(self):
        """Return the item weight, trying three sources in order."""
        # 1) weight embedded in the dimensions value ("size; weight")
        parts = self.get_dimension().split(';')
        if len(parts) == 2:
            return parts[-1].strip()

        # 2) dedicated weight row in the detail section
        weight = self._first_detail(('Poids du produit', 'Item Weight'))
        if weight:
            return weight

        # 3) weight row in the product overview table
        return self.d('.a-spacing-small.po-item_weight td:nth-of-type(2)').text()

    def get_manufacturer(self):
        """Return the manufacturer detail value ('' if not listed)."""
        return self._first_detail((
            'Manufacturer', 'Fabricante', 'Fabricant', 'Hersteller', 'Produttore',
            'メーカー', 'Fabrikant',
        ))

    def get_node_path(self):
        """Return the breadcrumb text on a single line."""
        return self.d('#wayfinding-breadcrumbs_feature_div').text().replace('\n', ' ')

    # Stepping stone: logic borrowed from jungle-scout, used to obtain the
    # categoryCode needed to call its sales-estimate API.
    def get_category_code_map(self):
        """Map lower-cased search-dropdown labels to their category codes."""
        mapping = {}
        for option in self.d('#searchDropdownBox option').items():
            # An option may lack a value; treat that as an empty code.
            value = option.val() or ''
            mapping[option.text().lower()] = value.split('=')[-1].replace('-intl-ship', '')
        return mapping

    # Detail section (table rows / list items)
    def get_detail_dict(self):
        """Return ``{label: value}`` for the product detail section.

        The result is memoised on the instance, so repeated calls return the
        same dict object and do not walk the DOM again.
        """
        cached = getattr(self, '_detail_dict', None)
        if cached is None:
            cached = self._detail_dict = self._build_detail_dict()
        return cached

    def _build_detail_dict(self):
        """Walk the detail tables and bullet lists and collect label/value pairs."""
        def clean(s):
            return re.sub(r'[\u200f\u200e]', '', s).strip()

        css_list = [
            '#prodDetails tr',
            '#detailBullets_feature_div li',
            '#detailBulletsWrapper_feature_div li',
        ]
        # Rating rows carry a lot of junk markup and are read elsewhere, so
        # they are excluded here.
        exclude_review_pattern = re.compile('(?:reviews|media de los clientes)', re.I)
        detail_dict = {}
        for row in self.d(','.join(css_list)).items():
            header = row('th').text()
            if header and not exclude_review_pattern.search(header):
                detail_dict[clean(header)] = clean(row('td').text())
            if row('li').text():
                parts = row.text().split(':')
                if len(parts) == 2 and not exclude_review_pattern.search(parts[0]):
                    detail_dict[clean(parts[0])] = clean(parts[1])
        return detail_dict

    @staticmethod
    def _parse_bsr_entries(bsr_info, links):
        """Turn multi-line Best Sellers Rank text into a list of dicts.

        Each parsed line becomes ``{'category', 'rank', 'link'}``; links are
        consumed in order and ``''`` is used once they run out.  Lines
        without a recognisable rank or category are skipped.
        """
        remaining = list(links)
        entries = []
        for line in bsr_info.split('\n'):
            line = re.sub(r'\(.*?\)|\#|Nr\.|\,|\.|nº|n\.', '', line)
            rank_match = re.search(r'\d+', line)
            cate_match = re.search(r'(?:en|in|位|dans)(.*)', line)
            if not rank_match or not cate_match:
                continue
            entries.append({
                'category': cate_match.group(1).strip(),
                'rank': rank_match.group(0),
                'link': remaining.pop(0) if remaining else '',
            })
        return entries

    def get_bsr_list(self):
        """Return every Best Sellers Rank entry listed on the page."""
        bsr_link_css_list = [
            '#prodDetails tr a',
            '#detailBullets_feature_div li a',
            '#detailBulletsWrapper_feature_div li a',
        ]
        link_list = []
        for anchor in self.d(','.join(bsr_link_css_list)).items():
            href = anchor.attr('href')
            if href and 'bestsellers' in href:
                link_list.append(href)

        bsr_match_list = (
            'Amazon Bestseller',
            'Amazon Bestsellers Rank',
            'Best Sellers Rank',
            'Best-sellers rank',
            'Amazon Bestseller-Rang',
            "Classement des meilleures ventes d'Amazon",
            'Clasificación en los más vendidos de Amazon',
            'Posizione nella classifica Bestseller di Amazon',
            'Amazon 売れ筋ランキング',
            'Plaats in bestsellerlijst',
        )
        bsr_info = self._first_detail(bsr_match_list)
        if bsr_info:
            return self._parse_bsr_entries(bsr_info, link_list)
        return []

    @staticmethod
    def _lookup_category_code(name, tables):
        """Find ``name`` in per-country tables shaped like CATEGORY_CODES_BY_COUNTRY.

        Keys are compared after stripping whitespace because some table keys
        carry stray leading or trailing spaces.  Returns ``''`` if not found.
        """
        wanted = name.strip().lower()
        for table in tables:
            for label, code in table.get('categoryCodes', {}).items():
                if label.strip() == wanted and code:
                    return code
        return ''

    def get_category_code(self, category1):
        """Resolve a top-level category name to its category code.

        The page's own search dropdown is consulted first.  If the selected
        dropdown entry is ``aps``, the static per-country tables are searched
        and ``aps`` is returned when they have no match.  Otherwise
        ``'N.A.'`` is returned.
        """
        blank_value = 'N.A.'
        category_code = self.get_category_code_map().get(category1.lower())
        if category_code:
            return category_code

        selected_value = self.d('#searchDropdownBox option:selected').val() or ''
        selected_cate_code = selected_value.split('=')[-1].replace('-intl-ship', '')
        if selected_cate_code != 'aps':
            return blank_value
        found = self._lookup_category_code(category1, CATEGORY_CODES_BY_COUNTRY)
        return found or selected_cate_code

    def get_rank_and_category(self):
        """Return the top BSR entry with its category code ({} if none)."""
        bsr_list = self.get_bsr_list()
        if not bsr_list:
            return {}
        top = bsr_list[0]
        return {
            'category': top['category'],
            'rank': top['rank'],
            'categoryCode': self.get_category_code(top['category']),
        }

    def get_all(self):
        """Return every extracted field in one dict."""
        return {
            "asin": self.get_asin(),
            "title": self.get_title(),
            "bullet_point": self.get_bullet_point(),
            "price": self.get_price(),
            "price_ped": self.get_price_ped(),
            "is_promotion": self.is_promotion(),
            "coupon": self.get_coupon(),
            "variant_cnt": self.get_variant(),
            "main_img_url": self.get_main_img_url(),
            "video_url": self.get_video_url(),
            "node_path": self.get_node_path(),
            "rating": self.get_rating(),
            "rating_cnt": self.get_rating_cnt(),
            "qa_cnt": self.get_qa(),
            "monthly_sales": self.get_monthly_sales(),
            "color": self.get_color(),
            "size": self.get_size(),
            "sold_by": self.get_sold_by(),
            "ships_from": self.get_ships_from(),
            "dimension": self.get_dimension(),
            "manufacturer": self.get_manufacturer(),
            "brand": self.get_brand(),
            "weight": self.get_weight(),
            "bsr_list": self.get_bsr_list(),
            "listing_date": self.get_listing_date(),
            "rank_and_category": self.get_rank_and_category(),
        }
@@ -0,0 +1,252 @@
1
+ import re
2
+ from functools import lru_cache
3
+
4
+ from dateparser.search import search_dates
5
+ from pyquery import PyQuery as pq
6
+
7
+
8
+ class ReviewParser:
9
+ site_alias = {
10
+ "US": [
11
+ "United States",
12
+ "Vereinigten Staaten",
13
+ "美国",
14
+ "アメリカ合衆国",
15
+ "Estados Unidos",
16
+ "Estados Unidos",
17
+ "États-Unis",
18
+ "Stati Uniti"
19
+ ],
20
+ "JP": [
21
+ "Japan",
22
+ "Japan",
23
+ "日本",
24
+ "日本",
25
+ "Japón",
26
+ "Japão",
27
+ "Japon",
28
+ "Giappone"
29
+ ],
30
+ "DE": [
31
+ "Germany",
32
+ "Deutschland",
33
+ "德国",
34
+ "ドイツ",
35
+ "Alemania",
36
+ "Alemanha",
37
+ "Allemagne",
38
+ "Germania"
39
+ ],
40
+ "UK": [
41
+ "United Kingdom",
42
+ "Vereinigten Königreich",
43
+ "英国",
44
+ "英国",
45
+ "Reino Unido",
46
+ "Reino Unido",
47
+ "Royaume-Uni",
48
+ "Regno Unito"
49
+ ],
50
+ "FR": [
51
+ "France",
52
+ "Frankreich",
53
+ "法国",
54
+ "フランス",
55
+ "Francia",
56
+ "França",
57
+ "France",
58
+ "Francia"
59
+ ],
60
+ "IT": [
61
+ "Italy",
62
+ "Italien",
63
+ "意大利",
64
+ "イタリア",
65
+ "Italia",
66
+ "Itália",
67
+ "Italie",
68
+ "Italia"
69
+ ],
70
+ "ES": [
71
+ "Spain",
72
+ "Spanien",
73
+ "西班牙",
74
+ "スペイン",
75
+ "España",
76
+ "Espanha",
77
+ "Espagne",
78
+ "Spagna"
79
+ ],
80
+ "CA": [
81
+ "Canada",
82
+ "Kanada",
83
+ "加拿大",
84
+ "カナダ",
85
+ "Canadá",
86
+ "Canadá",
87
+ "Canada",
88
+ "Canada"
89
+ ],
90
+ "IN": [
91
+ "India",
92
+ "Indien",
93
+ "印度",
94
+ "インド",
95
+ "India",
96
+ "Índia",
97
+ "Inde",
98
+ "India"
99
+ ],
100
+ "MX": [
101
+ "Mexico",
102
+ "Mexiko",
103
+ "墨西哥",
104
+ "メキシコ",
105
+ "México",
106
+ "México",
107
+ "Mexique",
108
+ "Messico"
109
+ ],
110
+ "AU": [
111
+ "Australia",
112
+ "Australien",
113
+ "澳大利亚",
114
+ "オーストラリア",
115
+ "Australia",
116
+ "Austrália",
117
+ "Australie",
118
+ "Australia"
119
+ ],
120
+ "AE": [
121
+ "阿联酋"
122
+ ],
123
+ "NL": [
124
+ "Netherlands",
125
+ "Niederlanden",
126
+ "荷兰",
127
+ "オランダ",
128
+ "Países Bajos",
129
+ "Países Baixos",
130
+ "Pays-Bas",
131
+ "Paesi Bassi"
132
+ ],
133
+ "SE": [
134
+ "Sweden",
135
+ "瑞典"
136
+ ],
137
+ "SA": [
138
+ "Saudi Arabia",
139
+ "沙特阿拉伯",
140
+ ],
141
+ "SG": [
142
+ "Singapore",
143
+ "Singapur",
144
+ "新加坡",
145
+ "シンガポール",
146
+ "Singapur",
147
+ "Singapura",
148
+ "Singapour",
149
+ "Singapore"
150
+ ]
151
+ }
152
+ amazon_host_mapping = {
153
+ 'US': 'www.amazon.com',
154
+ 'UK': 'www.amazon.co.uk',
155
+ 'DE': 'www.amazon.de',
156
+ 'FR': 'www.amazon.fr',
157
+ 'IT': 'www.amazon.it',
158
+ 'JP': 'www.amazon.co.jp',
159
+ 'CA': 'www.amazon.ca',
160
+ 'MX': 'www.amazon.com.mx',
161
+ 'ES': 'www.amazon.es',
162
+ 'IN': 'www.amazon.in',
163
+ 'AU': 'www.amazon.com.au',
164
+ 'AE': 'www.amazon.ae',
165
+ 'NL': 'www.amazon.nl',
166
+ 'SE': 'www.amazon.se',
167
+ 'SA': 'www.amazon.com.sa',
168
+ 'SG': 'www.amazon.com.sg'
169
+ }
170
+
171
+ def __init__(self, html, asin=''):
172
+ self.html = html
173
+ self.d = pq(self.html.replace('\\n', '').replace("\n", '').replace('\\', ''))
174
+ self.asin = asin
175
+
176
+ @classmethod
177
+ @lru_cache()
178
+ def get_nations_pattern(cls):
179
+ """将所有的国家词汇放入一个pattern中"""
180
+ nation_li = []
181
+ for _, value in cls.site_alias.items():
182
+ nation_li.extend(value)
183
+ return '|'.join(set(nation_li))
184
+
185
+ def get_site(self, nation_name):
186
+ # 根据匹配到的国家信息获取其站点简称,以区分站点及获取评论链接
187
+ for site, alias in self.site_alias.items():
188
+ if nation_name in alias:
189
+ return site
190
+ return None
191
+
192
+ def get_country_and_date(self, dom):
193
+ # 获取评论的国家和日期
194
+ blank_value = 'N.A.'
195
+ country_and_date = dom('[data-hook="review-date"]').text()
196
+ # TODO: 各国语言配置可能不对
197
+ languages = ['de', 'en', 'fr', 'ja', 'it', 'zh', 'es']
198
+ comment_date_match = search_dates(country_and_date, languages=languages)
199
+ if comment_date_match:
200
+ comment_date = comment_date_match[0][1].strftime('%Y-%m-%d')
201
+ else:
202
+ comment_date = blank_value
203
+ country_info = re.findall(self.get_nations_pattern(), country_and_date)
204
+ if country_info:
205
+ # TODO: 此处国家匹配可能缺失,get_site
206
+ country = self.get_site(country_info[0])
207
+ else:
208
+ country = blank_value
209
+
210
+ return country, comment_date
211
+
212
+ def get_comment_qty(self):
213
+ text = self.d('#filter-info-section').text()
214
+ review_text = re.findall(r'([\d,.]+)', text)
215
+ if review_text:
216
+ return int(review_text[-1].replace(',', '').replace('.', ''))
217
+
218
+ def parse_all(self):
219
+ li = []
220
+ for each in self.d('div[id][data-hook="review"]').items():
221
+ comment_id = each.attr('id')
222
+ title = each('[data-hook="review-title"] > span').text()
223
+ content = each('span[data-hook="review-body"]').text()
224
+ vp = '是' if each('span[data-hook="avp-badge"]') else '否'
225
+ variant = each('a[data-hook="format-strip"]').text()
226
+ asin = re.findall('product-reviews/(.*?)/', each('a[data-hook="format-strip"]').attr('href'))[
227
+ 0] if variant else self.asin
228
+ rating = each('i[data-hook*="review-star-rating"]').text()[0]
229
+ helpful_info = re.findall(r'[\d,.]+', each('span[data-hook="helpful-vote-statement"]').text())
230
+ helpful = helpful_info[0] if helpful_info else 0
231
+ name = each('.a-profile-name').text()
232
+ country, comment_date = self.get_country_and_date(each)
233
+ # 修复因国家没解析出来而导致的评论链接错误
234
+ comment_url = 'https://{}/gp/customer-reviews/{}'.format(self.amazon_host_mapping[country],
235
+ comment_id) if country != 'N.A.' else ''
236
+
237
+ item = {
238
+ 'asin': asin,
239
+ 'comment_id': comment_id,
240
+ 'variant': variant,
241
+ 'name': name,
242
+ 'title': title,
243
+ 'content': content,
244
+ 'vp': vp,
245
+ 'rating': rating,
246
+ 'helpful': helpful,
247
+ 'comment_date': comment_date,
248
+ 'country': country,
249
+ 'comment_url': comment_url
250
+ }
251
+ li.append(item)
252
+ return li
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.1
2
+ Name: amz-extractor
3
+ Version: 0.9.9
4
+ Summary: 提取亚马逊详情页和评论信息
5
+ Author: __token__
6
+ License: UNKNOWN
7
+ Platform: UNKNOWN
8
+
9
+ UNKNOWN
10
+
@@ -0,0 +1,9 @@
1
+ setup.py
2
+ amz_extractor/__init__.py
3
+ amz_extractor/listing_extractor.py
4
+ amz_extractor/review_extractor.py
5
+ amz_extractor.egg-info/PKG-INFO
6
+ amz_extractor.egg-info/SOURCES.txt
7
+ amz_extractor.egg-info/dependency_links.txt
8
+ amz_extractor.egg-info/requires.txt
9
+ amz_extractor.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ dateparser>=1.1.4
2
+ pyquery>=1.4.3
@@ -0,0 +1 @@
1
+ amz_extractor
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,20 @@
1
"""Packaging script for the ``amz_extractor`` distribution."""
# setuptools replaces distutils here: distutils was removed from the standard
# library in Python 3.12, and ``install_requires`` is a setuptools keyword
# that plain distutils does not support.
from setuptools import setup

# An earlier, disabled configuration in this file used the name 'amz_parser'
# (version 0.9.7, author 'lonely').

setup(
    name='amz_extractor',
    version='0.9.9',
    description='提取亚马逊详情页和评论信息',
    # NOTE(review): '__token__' looks like a placeholder rather than a real
    # author name -- confirm.
    author='__token__',
    packages=['amz_extractor'],
    package_dir={'amz_extractor': 'amz_extractor'},
    # Runtime dependencies of the listing and review parsers.
    install_requires=['dateparser>=1.1.4', 'pyquery>=1.4.3'],
)