scraper2-hj3415 0.0.1 (scraper2_hj3415-0.0.1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraper2_hj3415/__init__.py +0 -0
- scraper2_hj3415/nfscrapy/__init__.py +0 -0
- scraper2_hj3415/nfscrapy/nfs/__init__.py +0 -0
- scraper2_hj3415/nfscrapy/nfs/items.py +34 -0
- scraper2_hj3415/nfscrapy/nfs/middlewares.py +103 -0
- scraper2_hj3415/nfscrapy/nfs/pipelines.py +26 -0
- scraper2_hj3415/nfscrapy/nfs/settings.py +100 -0
- scraper2_hj3415/nfscrapy/nfs/spiders/__init__.py +4 -0
- scraper2_hj3415/nfscrapy/nfs/spiders/c101.py +74 -0
- scraper2_hj3415/nfscrapy/run.py +160 -0
- scraper2_hj3415/nfscrapy/scrapy.cfg +11 -0
- scraper2_hj3415-0.0.1.dist-info/METADATA +26 -0
- scraper2_hj3415-0.0.1.dist-info/RECORD +15 -0
- scraper2_hj3415-0.0.1.dist-info/WHEEL +4 -0
- scraper2_hj3415-0.0.1.dist-info/licenses/LICENSE +19 -0
scraper2_hj3415/__init__.py: file without changes (empty file)
scraper2_hj3415/nfscrapy/__init__.py: file without changes (empty file)
scraper2_hj3415/nfscrapy/nfs/__init__.py: file without changes (empty file)
scraper2_hj3415/nfscrapy/nfs/items.py
@@ -0,0 +1,34 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class C101items(scrapy.Item):
+    date = scrapy.Field()
+    종목명 = scrapy.Field()
+    코드 = scrapy.Field()
+    업종 = scrapy.Field()
+    EPS = scrapy.Field()  # earnings per share; EPS * PER = share price
+    BPS = scrapy.Field()  # book value per share
+    PER = scrapy.Field()  # how many multiples of net income the company trades at
+    업종PER = scrapy.Field()
+    PBR = scrapy.Field()
+    배당수익률 = scrapy.Field()
+
+    주가 = scrapy.Field()
+    최고52주 = scrapy.Field()
+    최저52주 = scrapy.Field()
+    거래량 = scrapy.Field()
+    거래대금 = scrapy.Field()
+    시가총액 = scrapy.Field()
+    베타52주 = scrapy.Field()
+    발행주식 = scrapy.Field()
+    유통비율 = scrapy.Field()
+
+    # company overview section
+    intro1 = scrapy.Field()
+    intro2 = scrapy.Field()
+    intro3 = scrapy.Field()
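For reference, scrapy.Item subclasses such as C101items behave like dicts: declared fields are assigned by key (undeclared keys raise KeyError), and the pipeline below serializes them with itemadapter. A minimal sketch with made-up values, assuming the nfs package is importable (e.g. when run from inside nfscrapy/):

from itemadapter import ItemAdapter
from nfs.items import C101items  # assumed import path, matching "from nfs import items" in the spider

item = C101items()
item['date'] = '2023/04/07'   # illustrative value
item['코드'] = '005930'        # illustrative value
item['종목명'] = '삼성전자'      # illustrative value

# The MongoPipeline stores items as plain dicts via ItemAdapter.
print(ItemAdapter(item).asdict())  # {'date': '2023/04/07', '코드': '005930', '종목명': '삼성전자'}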
scraper2_hj3415/nfscrapy/nfs/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class NfsSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class NfsDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
scraper2_hj3415/nfscrapy/nfs/pipelines.py
@@ -0,0 +1,26 @@
+from itemadapter import ItemAdapter
+from pymongo import errors
+
+
+class MongoPipeline:
+
+    collection_name = 'c101'
+
+    def open_spider(self, spider):
+        self.client = spider.mongo_client
+
+    def process_item(self, item, spider):
+        print("In the MongoPipeline...", end="")
+        if self.client is None:
+            print("Skip the save to mongoDB.")
+        else:
+            print("Save data to mongoDB.")
+            my_collection = self.client[item['코드']][self.collection_name]
+            try:
+                my_collection.create_index('date', unique=True)
+                my_collection.insert_one(ItemAdapter(item).asdict())
+            except errors.DuplicateKeyError:
+                # If the scraped data already exists in the database, delete it and save again
+                my_collection.delete_many({'date': {"$gte": item['date']}})
+                my_collection.insert_one(ItemAdapter(item).asdict())
+        return item
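The pipeline's duplicate handling can be exercised on its own: with a unique index on date, a second insert for the same date raises DuplicateKeyError, after which documents with an equal or later date are deleted and the item is saved again. A minimal sketch of that flow, assuming a reachable MongoDB at a hypothetical local address and using a throwaway database/collection that mirror the pipeline's naming:

import pymongo
from pymongo import errors

# Hypothetical local server; '005930'/'c101' mirror MongoPipeline's per-code database and collection.
client = pymongo.MongoClient('mongodb://localhost:27017', serverSelectionTimeoutMS=5000)
col = client['005930']['c101']

col.create_index('date', unique=True)
doc = {'date': '2023/04/07', '주가': '65000'}  # illustrative document

try:
    col.insert_one(dict(doc))  # copy, since insert_one adds an _id to the passed dict
except errors.DuplicateKeyError:
    # Same behavior as MongoPipeline: drop documents with an equal or later date, then save again.
    col.delete_many({'date': {"$gte": doc['date']}})
    col.insert_one(dict(doc))

client.close()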
scraper2_hj3415/nfscrapy/nfs/settings.py
@@ -0,0 +1,100 @@
+# Scrapy settings for nfs project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "nfs"
+
+SPIDER_MODULES = ["nfs.spiders"]
+NEWSPIDER_MODULE = "nfs.spiders"
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = "nfs (+http://www.yourdomain.com)"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+RANDOMIZE_DOWNLOAD_DELAY = True
+
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+#    "Accept-Language": "en",
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    "nfs.middlewares.NfsSpiderMiddleware": 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    "nfs.middlewares.NfsDownloaderMiddleware": 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    "nfs.pipelines.MongoPipeline": 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = "httpcache"
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"
+
+
+
+LOG_ENABLED = True
+LOG_LEVEL = 'WARNING'
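run.py (further below) loads these values through get_project_settings(); individual settings can also be overridden per run before the crawler starts. A small sketch of that pattern, assuming it is executed from inside the nfscrapy project directory so scrapy.cfg is found:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('LOG_LEVEL', 'INFO')   # more verbose than the packaged 'WARNING'
settings.set('DOWNLOAD_DELAY', 1)   # illustrative override of the 3-second delay

process = CrawlerProcess(settings)
# process.crawl('c101', codes=['005930'], mongo_client=None)  # spider args as passed by run._run_scrapy
# process.start()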
scraper2_hj3415/nfscrapy/nfs/spiders/c101.py
@@ -0,0 +1,74 @@
+import scrapy
+from nfs import items
+#from nfs.spiders import common
+
+# cmd usage : scrapy crawl c101 -a codes=005930
+
+# Lambda that removes or replaces miscellaneous stray symbols in scraped strings
+cleaning = lambda s: (
+    s.strip().replace('\t', '').replace('\r', '').replace('\n', '').replace(',', '').replace('원', '')
+    .replace('주', '').replace('억', '00000000').replace('%', '') if s is not None and s != 'N/A' else None
+)
+css_path = lambda i, j: f'#pArea>div.wrapper-table>div>table>tr:nth-child({i})>td>dl>dt:nth-child({j})>'
+css_path2 = lambda i: f'#cTB11>tbody>tr:nth-child({i})>'
+str_or_blank = lambda i: '' if i is None else str(i)
+
+
+class C101Spider(scrapy.Spider):
+    name = 'c101'
+    allowed_domains = ['navercomp.wisereport.co.kr']  # https address
+
+    def __init__(self, codes, mongo_client, *args, **kwargs):
+        super(C101Spider, self).__init__(*args, **kwargs)
+        # Command-line input only arrives as a string, so it needs to be converted to a list.
+        if type(codes) == str:
+            self.codes = [codes, ]
+        elif type(codes) == list:
+            self.codes = codes
+        else:
+            raise TypeError
+        self.mongo_client = mongo_client
+
+    def start_requests(self):
+        total_count = len(self.codes)
+        print(f'Start scraping {self.name}, {total_count} items...')
+        self.logger.info(f'entire codes list - {self.codes}')
+        for i, one_code in enumerate(self.codes):
+            print(f'{i + 1}/{total_count}. Parsing {self.name}...{one_code}')
+            # reference from https://docs.scrapy.org/en/latest/topics/request-response.html
+            yield scrapy.Request(url=f'https://navercomp.wisereport.co.kr/v2/company/c1010001.aspx?cmp_cd={one_code}',
+                                 callback=self.parse_c101,
+                                 cb_kwargs=dict(code=one_code),
+                                 )
+
+    def parse_c101(self, response, code):
+        self.logger.debug(response.text)
+        item = items.C101items()
+        try:
+            item['date'] = response.xpath('//*[ @ id = "wrapper"]/div[1]/div[1]/div[1]/dl/dd[2]/p/text()')\
+                .get().replace('[기준:', '').replace(']', '')
+        except AttributeError:
+            self.logger.error(f'ERROR : Page not found...{code}')
+            return None
+        item['종목명'] = response.css(css_path(1, 1) + 'span::text').get()
+        item['코드'] = cleaning(response.css(css_path(1, 1) + 'b::text').get())
+        item['업종'] = response.css(css_path(1, 4).rstrip('>') + '::text').get().replace('WICS : ', '')
+        item['EPS'] = cleaning(response.css(css_path(3, 1) + 'b::text').get())
+        item['BPS'] = cleaning(response.css(css_path(3, 2) + 'b::text').get())
+        item['PER'] = cleaning(response.css(css_path(3, 3) + 'b::text').get())
+        item['업종PER'] = cleaning(response.css(css_path(3, 4) + 'b::text').get())
+        item['PBR'] = cleaning(response.css(css_path(3, 5) + 'b::text').get())
+        item['배당수익률'] = cleaning(response.css(css_path(3, 6) + 'b::text').get())
+        item['주가'] = cleaning(response.css(css_path2(1) + 'td>strong::text').get())
+        item['최고52주'], item['최저52주'] = map(cleaning, response.css(css_path2(2) + 'td::text').get().split('/'))
+        item['거래량'], item['거래대금'] = map(cleaning, response.css(css_path2(4) + 'td::text').get().split('/'))
+        item['시가총액'] = cleaning(response.css(css_path2(5) + 'td::text').get())
+        item['베타52주'] = cleaning(response.css(css_path2(6) + 'td::text').get())
+        item['발행주식'], item['유통비율'] = map(cleaning, response.css(css_path2(7) + 'td::text').get().split('/'))
+
+        item['intro1'] = str_or_blank(response.xpath('// *[ @ id = "wrapper"] / div[5] / div[2] / ul / li[1] / text()').get())
+        item['intro2'] = str_or_blank(response.xpath('// *[ @ id = "wrapper"] / div[5] / div[2] / ul / li[2] / text()').get())
+        item['intro3'] = str_or_blank(response.xpath('// *[ @ id = "wrapper"] / div[5] / div[2] / ul / li[3] / text()').get())
+
+        self.logger.info(item)
+        yield item
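The cleaning helper above is the main normalization step: it strips whitespace and the units the page appends (원, 주, %), removes thousands separators, and expands 억 into eight zeros so numeric strings can be parsed downstream. A quick illustration with made-up inputs of the kind the page returns; the lambda is repeated verbatim so the snippet runs standalone:

cleaning = lambda s: (
    s.strip().replace('\t', '').replace('\r', '').replace('\n', '').replace(',', '').replace('원', '')
    .replace('주', '').replace('억', '00000000').replace('%', '') if s is not None and s != 'N/A' else None
)

print(cleaning(' 65,000원\n'))  # '65000'
print(cleaning('4,716억'))      # '471600000000'
print(cleaning('N/A'))          # None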
scraper2_hj3415/nfscrapy/run.py
@@ -0,0 +1,160 @@
+import os
+import time
+import pymongo
+
+from scrapy.crawler import CrawlerProcess
+from multiprocessing import Process, cpu_count
+from scrapy.utils.project import get_project_settings
+
+from util_hj3415 import utils
+
+import logging
+logger = logging.getLogger(__name__)
+formatter = logging.Formatter('%(levelname)s: [%(name)s] %(message)s')
+ch = logging.StreamHandler()
+ch.setFormatter(formatter)
+logger.addHandler(ch)
+logger.setLevel(logging.ERROR)
+
+
+def chcwd(func):
+    """
+    Decorator that temporarily changes the current working directory, because scrapy must always be run from inside its project.
+    :param func:
+    :return:
+    """
+    def wrapper(*args, **kwargs):
+        before_cwd = os.getcwd()
+        logger.info(f'current path : {before_cwd}')
+        after_cwd = os.path.dirname(os.path.realpath(__file__))
+        logger.info(f'change path to {after_cwd}')
+        os.chdir(after_cwd)
+        func(*args, **kwargs)
+        logger.info(f'restore path to {before_cwd}')
+        os.chdir(before_cwd)
+    return wrapper
+
+
+def _run_scrapy(spider: str, codes: list, mongo_addr: str):
+    """
+    Runs a scrapy spider from a script.
+    :param spider:
+    :param codes:
+    :param mongo_addr:
+    :return:
+    """
+    if mongo_addr == "":
+        mongo_client = None
+    else:
+        mongo_client = connect_mongo(mongo_addr)
+
+    process = CrawlerProcess(get_project_settings())
+    process.crawl(spider, codes=codes, mongo_client=mongo_client)
+    process.start()
+
+    if mongo_client is not None:
+        mongo_client.close()
+
+
+def _code_divider(entire_codes: list) -> tuple:
+    """
+    Splits the full list of stock codes into parts matching the number of CPU cores.
+    reference from https://stackoverflow.com/questions/19086106/how-to-utilize-all-cores-with-python-multiprocessing
+    :param entire_codes:
+    :return:
+    """
+    def _split_list(alist, wanted_parts=1):
+        """
+        Splits a list into the number of parts used for multiprocessing.
+        reference from https://www.it-swarm.dev/ko/python/%EB%8D%94-%EC%9E%91%EC%9D%80-%EB%AA%A9%EB%A1%9D%EC%9C%BC%EB%
+        A1%9C-%EB%B6%84%ED%95%A0-%EB%B0%98%EC%9C%BC%EB%A1%9C-%EB%B6%84%ED%95%A0/957910776/
+        :param alist:
+        :param wanted_parts:
+        :return:
+        """
+        length = len(alist)
+        return [alist[i * length // wanted_parts: (i + 1) * length // wanted_parts]
+                for i in range(wanted_parts)]
+
+    core = cpu_count()
+    print(f'Get number of core for multiprocessing : {core}')
+    n = core - 1
+    if len(entire_codes) < n:
+        n = len(entire_codes)
+    print(f'Split total {len(entire_codes)} codes by {n} parts ...')
+    divided_list = _split_list(entire_codes, wanted_parts=n)
+    return n, divided_list
+
+
+def _mp_c10168(page: str, codes: list, mongo_addr: str):
+    """
+    Splits the full code list across the CPU cores and scrapes the parts with multiprocessing.
+    reference from https://monkey3199.github.io/develop/python/2018/12/04/python-pararrel.html
+
+    Creating a MongoClient in the parent and passing it into the workers causes errors, so each process must create its own client.
+    referred from https://blog.naver.com/PostView.nhn?blogId=stop2y&logNo=222211823932&categoryNo=136&parentCategoryNo=
+    0&viewDate=&currentPage=1&postListTopCurrentPage=1&from=postView
+
+    :param page:
+    :param codes:
+    :param mongo_addr:
+    :return:
+    """
+    if page not in ('c101', 'c106', 'c108'):
+        raise NameError
+    print('*' * 25, f"Scrape multiprocess {page.capitalize()}", '*' * 25)
+    print(f'Total {len(codes)} items..')
+    logger.info(codes)
+    n, divided_list = _code_divider(codes)
+
+    start_time = time.time()
+    ths = []
+    error = False
+    for i in range(n):
+        ths.append(Process(target=_run_scrapy, args=(page, divided_list[i], mongo_addr)))
+    for i in range(n):
+        ths[i].start()
+    for i in range(n):
+        ths[i].join()
+    print(f'Total spent time : {round(time.time() - start_time, 2)} sec.')
+
+
+def connect_mongo(addr: str, timeout=5) -> pymongo.MongoClient:
+    """
+    Creates a MongoDB client.
+    Working through a single client is more efficient than creating one whenever needed, so this is factored out into its own function.
+    resolve conn error - https://stackoverflow.com/questions/54484890/ssl-handshake-issue-with-pymongo-on-python3
+    :param addr:
+    :param timeout:
+    :return:
+    """
+    import certifi
+    ca = certifi.where()
+    if addr.startswith('mongodb://'):
+        # set a connection timeout of `timeout` seconds
+        client = pymongo.MongoClient(addr, serverSelectionTimeoutMS=timeout * 1000)
+    elif addr.startswith('mongodb+srv://'):
+        client = pymongo.MongoClient(addr, serverSelectionTimeoutMS=timeout * 1000, tlsCAFile=ca)
+    else:
+        raise Exception(f"Invalid address: {addr}")
+    try:
+        srv_info = client.server_info()
+        conn_str = f"Connect to Mongo Atlas v{srv_info['version']}..."
+        print(conn_str, f"Server Addr : {addr}")
+        return client
+    except Exception:
+        conn_str = f"Unable to connect to the server.(MY IP : {utils.get_ip_addr()})"
+        raise Exception(f"{conn_str} Server Addr : {addr}")
+
+
+@chcwd
+def c101(codes: list, mongo_addr: str = ""):
+    """
+    Entry point for running the c101 spider from outside the package.
+    :param codes: list of stock codes
+    :param mongo_addr: MongoDB URI - mongodb://...
+    :return:
+    """
+    _mp_c10168('c101', codes=codes, mongo_addr=mongo_addr)
+
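Putting the pieces together, the intended entry point appears to be run.c101(): it switches into the scrapy project directory, splits the codes across worker processes, and each worker builds its own MongoClient from the URI. A minimal usage sketch, assuming the import path implied by the RECORD below and a hypothetical MongoDB URI (pass an empty string, the default, to skip saving):

from scraper2_hj3415.nfscrapy import run

if __name__ == '__main__':  # guard needed because run.py spawns multiprocessing.Process workers
    # Scrape two codes without a database; the MongoPipeline only prints/logs in that case.
    run.c101(['005930', '000660'])

    # Or save the results into MongoDB (hypothetical URI).
    # run.c101(['005930', '000660'], mongo_addr='mongodb://localhost:27017')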
scraper2_hj3415/nfscrapy/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = nfs.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = nfs
scraper2_hj3415-0.0.1.dist-info/METADATA
@@ -0,0 +1,26 @@
+Metadata-Version: 2.1
+Name: scraper2_hj3415
+Version: 0.0.1
+Summary: Gathering the stock data
+Project-URL: Homepage, https://pypi.org/project/scraper2_hj3415/
+Project-URL: Bug Tracker, https://pypi.org/project/scraper2_hj3415/
+Author-email: Hyungjin Kim <hj3415@gmail.com>
+License-File: LICENSE
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+
+# scraper2_hj3415 Package
+
+This is a simple example package. You can use
+[Github-flavored Markdown](https://guides.github.com/features/mastering-markdown/)
+to write your content.
+
+When creating a new scrapy project, the outermost project folder name must differ from the package folder inside it; otherwise scrapy raises an error saying it cannot find the settings file.
+(e.g. nfscrapy/nfs: the project is named nfs and the outer folder is renamed to nfscrapy)
+
+[Tutorials - How to packaging](https://packaging.python.org/en/latest/tutorials/packaging-projects/)
+
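That naming note is reflected in this wheel's own layout, as listed in the RECORD below: the outer folder is nfscrapy/ (holding scrapy.cfg and run.py) while the scrapy project package referenced by scrapy.cfg (default = nfs.settings) is nfs/.

scraper2_hj3415/
    nfscrapy/            # outer folder, renamed away from the project name
        scrapy.cfg
        run.py
        nfs/             # scrapy project package
            settings.py
            items.py
            pipelines.py
            middlewares.py
            spiders/
                c101.py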
scraper2_hj3415-0.0.1.dist-info/RECORD
@@ -0,0 +1,15 @@
+scraper2_hj3415/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2_hj3415/nfscrapy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2_hj3415/nfscrapy/run.py,sha256=G4UVhG0LYEdljiC7hbKCsiFVHSixmU3cOQEI0pM6Xso,5768
+scraper2_hj3415/nfscrapy/scrapy.cfg,sha256=yCkEgpzAwc9NWjYGaEUelGdLg3mUuuQF1Zl0k5vITH8,260
+scraper2_hj3415/nfscrapy/nfs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2_hj3415/nfscrapy/nfs/items.py,sha256=zKTQWz-IrA_MLY86oVExFhbFq-bgBEHMFlmfZL_CwGA,1017
+scraper2_hj3415/nfscrapy/nfs/middlewares.py,sha256=F3kL0OJMsUhiw2mPIxQLGvY3wdMxIsQl1BG2uwo_ZHo,3745
+scraper2_hj3415/nfscrapy/nfs/pipelines.py,sha256=gMM_Jfm7dbAfaTux1YvfbJ34vF-BBoIq6QZqX9wN0Uc,989
+scraper2_hj3415/nfscrapy/nfs/settings.py,sha256=UsmKxrDTNboO10CzeFi3IhNiTpiKg-gvNySSIlNiogQ,3426
+scraper2_hj3415/nfscrapy/nfs/spiders/__init__.py,sha256=ULwecZkx3_NTphkz7y_qiazBeUoHFnCCWnKSjoDCZj0,161
+scraper2_hj3415/nfscrapy/nfs/spiders/c101.py,sha256=WKqLQ5EzJTnH1uozyhO7j8Invv4kBy0JCFy4nemd4PI,4233
+scraper2_hj3415-0.0.1.dist-info/METADATA,sha256=EQ_vUTFP3ayPQFr510jgVchhwNWQtkVu78sCGFuZShU,1082
+scraper2_hj3415-0.0.1.dist-info/WHEEL,sha256=Fd6mP6ydyRguakwUJ05oBE7fh2IPxgtDN9IwHJ9OqJQ,87
+scraper2_hj3415-0.0.1.dist-info/licenses/LICENSE,sha256=6kbiFSfobTZ7beWiKnHpN902HgBx-Jzgcme0SvKqhKY,1091
+scraper2_hj3415-0.0.1.dist-info/RECORD,,
scraper2_hj3415-0.0.1.dist-info/licenses/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2018 The Python Packaging Authority
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.