scraper2-hj3415 0.0.2__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraper2_hj3415/cli.py +40 -3
- scraper2_hj3415/miscrapy/mi/__init__.py +0 -0
- scraper2_hj3415/miscrapy/mi/items.py +7 -0
- scraper2_hj3415/miscrapy/mi/middlewares.py +103 -0
- scraper2_hj3415/miscrapy/mi/pipelines.py +39 -0
- scraper2_hj3415/miscrapy/mi/settings.py +103 -0
- scraper2_hj3415/miscrapy/mi/spiders/__init__.py +4 -0
- scraper2_hj3415/miscrapy/mi/spiders/aud.py +23 -0
- scraper2_hj3415/miscrapy/mi/spiders/chf.py +25 -0
- scraper2_hj3415/miscrapy/mi/spiders/gbond3y.py +24 -0
- scraper2_hj3415/miscrapy/mi/spiders/gold.py +25 -0
- scraper2_hj3415/miscrapy/mi/spiders/kosdaq.py +23 -0
- scraper2_hj3415/miscrapy/mi/spiders/kospi.py +23 -0
- scraper2_hj3415/miscrapy/mi/spiders/mihistory.py +333 -0
- scraper2_hj3415/miscrapy/mi/spiders/silver.py +25 -0
- scraper2_hj3415/miscrapy/mi/spiders/sp500.py +24 -0
- scraper2_hj3415/miscrapy/mi/spiders/usdidx.py +30 -0
- scraper2_hj3415/miscrapy/mi/spiders/usdkrw.py +24 -0
- scraper2_hj3415/miscrapy/mi/spiders/wti.py +25 -0
- scraper2_hj3415/miscrapy/run.py +156 -0
- scraper2_hj3415/miscrapy/scrapy.cfg +11 -0
- scraper2_hj3415/nfscrapy/nfs/items.py +24 -0
- scraper2_hj3415/nfscrapy/nfs/pipelines.py +94 -16
- scraper2_hj3415/nfscrapy/nfs/settings.py +2 -1
- scraper2_hj3415/nfscrapy/nfs/spiders/_c108.py +95 -0
- scraper2_hj3415/nfscrapy/nfs/spiders/c101.py +3 -11
- scraper2_hj3415/nfscrapy/nfs/spiders/c103.py +174 -0
- scraper2_hj3415/nfscrapy/nfs/spiders/c104.py +228 -0
- scraper2_hj3415/nfscrapy/nfs/spiders/c106.py +92 -0
- scraper2_hj3415/nfscrapy/nfs/spiders/common.py +60 -0
- scraper2_hj3415/nfscrapy/run.py +76 -26
- {scraper2_hj3415-0.0.2.dist-info → scraper2_hj3415-0.1.0.dist-info}/METADATA +1 -1
- scraper2_hj3415-0.1.0.dist-info/RECORD +42 -0
- scraper2_hj3415-0.1.0.dist-info/entry_points.txt +2 -0
- scraper2_hj3415-0.0.2.dist-info/RECORD +0 -17
- scraper2_hj3415-0.0.2.dist-info/entry_points.txt +0 -2
- {scraper2_hj3415-0.0.2.dist-info → scraper2_hj3415-0.1.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-0.0.2.dist-info → scraper2_hj3415-0.1.0.dist-info}/licenses/LICENSE +0 -0
scraper2_hj3415/nfscrapy/nfs/pipelines.py

```diff
@@ -1,26 +1,104 @@
 from itemadapter import ItemAdapter
 from pymongo import errors
+from pprint import pprint
+from util_hj3415 import mongo
 
+import logging
+logger = logging.getLogger(__name__)
+formatter = logging.Formatter('%(levelname)s: [%(name)s] %(message)s')
+ch = logging.StreamHandler()
+ch.setFormatter(formatter)
+logger.addHandler(ch)
+logger.setLevel(logging.DEBUG)
 
-class MongoPipeline:
 
-
+class ValidationPipeline:
+    def process_item(self, item, spider):
+        print(f"\tIn the {self.__class__.__name__}...", end="")
+        if spider.name == 'c101':
+            print(f" Manual calculating EPS, BPS, PER, PBR")
+            logger.debug('*** Start c101 pipeline ***')
+            logger.debug(f"Raw data - EPS:{item['EPS']} BPS:{item['BPS']} PER:{item['PER']} PBR:{item['PBR']}")
+            # look up the c104 page so eps, bps, per and pbr can be recalculated and replaced
+            try:
+                logger.debug('Try to get c104 page for calculate values..')
+                c104 = mongo.C104(spider.mongo_client, item['코드'], 'c104q')
+                d, eps = c104.sum_recent_4q('EPS')  # sum the EPS values of the last four quarters
+                d, bps = c104.latest_value('BPS')  # take the BPS value of the latest quarter
 
-
-
+                # lambda that computes per and pbr
+                cal_ratio = (lambda eps_bps, pprice:
+                             None if eps_bps is None or eps_bps == 0 else round(int(pprice) / int(eps_bps), 2))
+                cal_per = cal_ratio(eps, item['주가'])
+                cal_pbr = cal_ratio(bps, item['주가'])
+                logger.debug(f"After calc data - EPS:{eps} BPS:{bps} PER:{cal_per} PBR:{cal_pbr}")
+                logger.debug(f"*** End c101 calculation pipeline {item['코드']} ***")
+            except:
+                logger.warning("Error on calculating custom EPS, BPS, PER, PBR, maybe DB hasn't c104q collection.")
+                logger.warning(
+                    f"We will use default scraped values - EPS:{item['EPS']} BPS:{item['BPS']} PER:{item['PER']} PBR:{item['PBR']}")
+                return item
+            item['EPS'], item['BPS'], item['PER'], item['PBR'] = eps, bps, cal_per, cal_pbr
+        if 'c103' in spider.name:
+            # pprint(item)
+            print(" Nothing special working")
+        if 'c104' in spider.name:
+            #pprint(item)
+            print(" Nothing special working")
+        if spider.name == 'c106':
+            pprint(item)
+            print(" Nothing special working")
+        return item
 
+
+class MongoPipeline:
     def process_item(self, item, spider):
-        print("
-        if
-            print("
+        print(f"\tIn the {self.__class__.__name__}...", end="")
+        if spider.mongo_client is None:
+            print(f"Skipping to save the data... code : {item['코드']} / spider : {spider.name}")
         else:
-            print("
-
-
-
-
-
-
-
-
+            print(f"Saving the {spider.name} to mongoDB...", end="")
+            if spider.name == 'c101':
+                page = spider.name
+                print(f" code : {item['코드']} / page : {page}")
+                c101_col = spider.mongo_client[item['코드']][page]
+                try:
+                    c101_col.create_index('date', unique=True)
+                    c101_col.insert_one(ItemAdapter(item).asdict())
+                except errors.DuplicateKeyError:
+                    # if the scraped data is already in the database, delete it and save again
+                    c101_col.delete_many({'date': {"$gte": item['date']}})
+                    c101_col.insert_one(ItemAdapter(item).asdict())
+            elif 'c103' in spider.name:
+                page = ''.join(['c103', item['title']])
+                print(f" code : {item['코드']} / page : {page}")
+                #logging.debug(item['df'].to_dict('records'))
+                #r = mongo2.C103(self.client, item['코드'], page).save(item['df'])
+            elif 'c104' in spider.name:
+                if item['title'].endswith('y'):
+                    page = 'c104y'
+                elif item['title'].endswith('q'):
+                    page = 'c104q'
+                else:
+                    raise ValueError
+                print(f" code : {item['코드']} / page : {page}({item['title']})")
+                logging.debug(item['df'].to_dict('records'))
+                r = mongo.C104(spider.mongo_client, item['코드'], page).save_df(item['df'])
+            elif spider.name == 'c106':
+                page = ''.join(['c106', item['title']])
+                print(f" code : {item['코드']} / page : {page}")
+                #logging.debug(item['df'].to_dict('records'))
+                #if page == 'c106y':
+                #    r = mongo2.C106Y(self.client, item['코드']).save(item['df'])
+                #elif page == 'c106q':
+                #    r = mongo2.C106Q(self.client, item['코드']).save(item['df'])
+                #else:
+                #    raise
+            """
+            elif spider.name == 'c108':
+                page = spider.name
+                print(f" code : {item['코드']} / page : {page}")
+                logging.debug(item['df'].to_dict('records'))
+                r = mongo2.C108(self.client, item['코드']).save(item['df'])
+            """
         return item
```
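The new ValidationPipeline replaces the scraped EPS/BPS/PER/PBR with values derived from the stored c104 quarterly data: EPS is summed over the most recent four quarters, BPS is the latest quarterly figure, and PER/PBR are the share price divided by those denominators. A minimal, self-contained sketch of that arithmetic; the literal inputs below are hypothetical stand-ins for the `mongo.C104` lookups:

```python
def cal_ratio(eps_bps, price):
    # mirrors the pipeline's lambda: None when the denominator is missing or zero
    if eps_bps is None or eps_bps == 0:
        return None
    return round(int(price) / int(eps_bps), 2)

eps_4q_sum = 5000    # hypothetical stand-in for c104.sum_recent_4q('EPS')
latest_bps = 40000   # hypothetical stand-in for c104.latest_value('BPS')
price = 70000        # hypothetical stand-in for item['주가']

print(cal_ratio(eps_4q_sum, price))  # PER -> 14.0
print(cal_ratio(latest_bps, price))  # PBR -> 1.75
```

Note the control flow: if the c104q lookup raises, the except block returns the item with the scraped values intact, so the overwrite on the following line only runs when the recalculation succeeded.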
scraper2_hj3415/nfscrapy/nfs/settings.py

```diff
@@ -65,7 +65,8 @@ COOKIES_ENABLED = False
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    "nfs.pipelines.
+    "nfs.pipelines.ValidationPipeline": 300,
+    "nfs.pipelines.MongoPipeline": 400,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
```
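Scrapy runs item pipelines in ascending order of the integer they are registered with, so every item passes through ValidationPipeline (300) before MongoPipeline (400) persists it. That ordering contract, as a tiny sketch:

```python
ITEM_PIPELINES = {
    "nfs.pipelines.ValidationPipeline": 300,  # validate/recalculate first
    "nfs.pipelines.MongoPipeline": 400,       # then persist
}
# Scrapy sorts by value, lowest first:
print(sorted(ITEM_PIPELINES, key=ITEM_PIPELINES.get))
# ['nfs.pipelines.ValidationPipeline', 'nfs.pipelines.MongoPipeline']
```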
scraper2_hj3415/nfscrapy/nfs/spiders/_c108.py

```diff
@@ -0,0 +1,95 @@
+import scrapy
+import pandas as pd
+
+from nfs import items
+from nfs.spiders import common
+
+
+class C108Spider(scrapy.Spider):
+    """
+    A service Naver no longer provides. Deactivated.
+    """
+    name = 'c108'
+    allowed_domains = ['navercomp.wisereport.co.kr']  # https address
+
+    def __init__(self, code, mongo_client, *args, **kwargs):
+        super(C108Spider, self).__init__(*args, **kwargs)
+        self.codes = common.adjust_arg_type(code)
+        self.mongo_client = mongo_client
+
+    def start_requests(self):
+        total_count = len(self.codes)
+        print(f'Start scraping {self.name}, {total_count} items...')
+        self.logger.info(f'entire codes list - {self.codes}')
+        for i, one_code in enumerate(self.codes):
+            print(f'{i + 1}/{total_count}. Parsing {self.name}...{one_code}')
+            # reference from https://docs.scrapy.org/en/latest/topics/request-response.html
+            yield scrapy.Request(url=f'https://navercomp.wisereport.co.kr/v2/company/c1080001.aspx?cmp_cd={one_code}',
+                                 callback=self.parse_c108,
+                                 cb_kwargs=dict(code=one_code)
+                                 )
+
+    def parse_c108(self, response, code):
+        """
+        XPATHs worked out for C108
+        date_XPATH = f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[1]'  # date
+        content_XPATH = f'//*[@id="td{i}"]/@data-content'  # title and contents
+        author_XPATH = f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[3]/div'  # author
+        company_XPATH = f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[4]/div'  # provider
+        opinion_XPATH = f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[5]/div[1]'  # investment opinion
+        hprice_XPATH = f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[6]/div[1]'  # target price
+        """
+        # if the c108 page has not been created yet, make an empty table and save it
+        title = response.xpath('/html/head/title/text()').get()
+        if title == '기업재무정보 접속장애':
+            self.logger.warning(f'{code}: None C108 data...We will make empty table..')
+            # make item to yield
+            item = items.C108items()
+            item['코드'] = code
+            item['df'] = pd.DataFrame(columns=['날짜', '제목', '작성자', '제공처', '투자의견', '목표가', '내용'])
+            yield item
+            return
+
+        # reference from http://hleecaster.com/python-pandas-creating-and-loading-dataframes/ (creating dataframes)
+        opinions = []
+        # the table holds at most 20 items
+        for i in range(20):
+            one_opinion = list()
+            # 1. append the date
+            one_opinion.append(response.xpath(f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[1]/text()').get())
+            # split the extracted title-and-contents list into the title and the contents
+            try:
+                title_and_contents = response.xpath(f'//*[@id="td{i}"]/@data-content').get()\
+                    .replace("<br/><span class='comment_text'>▶</span>", '▶').split('\n')
+                self.logger.debug(title_and_contents)
+            except AttributeError:
+                break
+            # 2. extract the title and delete it from the list
+            one_opinion.append(title_and_contents[0])
+            del title_and_contents[0]
+            # 3. extract author, provider, investment opinion and target price in turn and append them
+            for j in range(3, 7):
+                one_opinion.append(response.xpath(f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[{j}]/div/text()')
+                                   .get().replace('\t', '').replace('\r\n', ''))
+            # 4. extract the contents and add newline characters
+            contents = ''
+            for item in title_and_contents:
+                contents += item
+            one_opinion.append(contents)
+            self.logger.debug(one_opinion)
+            # 5. put the completed row into the opinions list
+            opinions.append(one_opinion)
+        self.logger.info(f'\ttotal {len(opinions)} opinions.. {code}')
+
+        df = pd.DataFrame(data=opinions, columns=['날짜', '제목', '작성자', '제공처', '투자의견', '목표가', '내용'])
+        df['내용'] = df['내용'].str.replace('\r', '')
+
+        self.logger.debug(df)
+        # make item to yield
+        item = items.C108items()
+        item['코드'] = code
+        item['df'] = df
+        yield item
+
+    def __str__(self):
+        return 'C108 Spider'
```
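parse_c108 collects one list per analyst-report row and only hands the batch to pandas at the end, while the empty-page branch yields a DataFrame with the same columns so downstream consumers always see a uniform schema. The same rows-to-DataFrame pattern, reduced to a runnable sketch with dummy data:

```python
import pandas as pd

COLUMNS = ['날짜', '제목', '작성자', '제공처', '투자의견', '목표가', '내용']

# dummy rows standing in for the XPath-extracted opinion rows
opinions = [
    ['21/01/04', 'Sample title', 'Analyst A', 'Broker B', '매수', '100,000', 'body\rtext'],
]

df = pd.DataFrame(data=opinions, columns=COLUMNS)
df['내용'] = df['내용'].str.replace('\r', '', regex=False)  # same carriage-return cleanup as the spider
print(df)

empty = pd.DataFrame(columns=COLUMNS)  # the shape yielded when the page does not exist
print(empty.empty)  # True
```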
scraper2_hj3415/nfscrapy/nfs/spiders/c101.py

```diff
@@ -1,8 +1,6 @@
 import scrapy
 from nfs import items
-
-
-# cmd usage : scrapy crawl c101 -a codes=005930
+from nfs.spiders import common
 
 # lambda that removes or replaces assorted stray symbols
 cleaning = lambda s: (
@@ -18,15 +16,9 @@ class C101Spider(scrapy.Spider):
     name = 'c101'
     allowed_domains = ['navercomp.wisereport.co.kr']  # https address
 
-    def __init__(self,
+    def __init__(self, code, mongo_client, *args, **kwargs):
         super(C101Spider, self).__init__(*args, **kwargs)
-
-        if type(codes) == str:
-            self.codes = [codes, ]
-        elif type(codes) == list:
-            self.codes = codes
-        else:
-            raise TypeError
+        self.codes = common.adjust_arg_type(code)
         self.mongo_client = mongo_client
 
     def start_requests(self):
```
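The inline str/list check that C101Spider carried in 0.0.2 moves into `nfs.spiders.common.adjust_arg_type`; common.py is listed above at +60 lines but its body is not shown in this diff. Judging from the code it replaces, a plausible reconstruction is a small normalizer. The body below is an assumption, not the released source:

```python
def adjust_arg_type(code):
    """Hypothetical reconstruction: normalize one code string or a list of codes into a list,
    mirroring the inline check the 0.0.2 spiders carried."""
    if isinstance(code, str):
        return [code]
    if isinstance(code, list):
        return code
    raise TypeError(f'str or list expected, got {type(code)}')

print(adjust_arg_type('005930'))              # ['005930']
print(adjust_arg_type(['005930', '000660']))  # ['005930', '000660']
```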
scraper2_hj3415/nfscrapy/nfs/spiders/c103.py

```diff
@@ -0,0 +1,174 @@
+import time
+import scrapy
+from util_hj3415 import utils
+from scrapy.selector import Selector
+from abc import *
+
+from nfs import items
+from nfs.spiders import common
+
+
+class C103Spider(scrapy.Spider, metaclass=ABCMeta):
+    name = 'c103'
+    allowed_domains = ['navercomp.wisereport.co.kr']
+    WAIT = 1.5
+
+    def __init__(self, code, mongo_client, title, *args, **kwargs):
+        """
+        Base class for the C103 pages. Concrete pages subclass it.
+
+        :param code:
+        :param title: supplied by the inheriting C103 subclass
+        :param mongo_client:
+        :param args:
+        :param kwargs:
+        """
+        super(C103Spider, self).__init__(*args, **kwargs)
+        self.codes = common.adjust_arg_type(code)
+        self.mongo_client = mongo_client
+        self.driver = utils.get_driver()
+        self.title = title  # e.g. 재무상태표q
+
+    def start_requests(self):
+        # reference from https://docs.scrapy.org/en/latest/topics/request-response.html
+        total_count = len(self.codes)
+        print(f'Start scraping {self.name}, {total_count} codes...')
+        self.logger.info(f'entire codes list - {self.codes}')
+
+        # issue the requests that actually scrape the pages
+        for i, one_code in enumerate(self.codes):
+            print(f'{i + 1}/{total_count}. Parsing {self.title}...{one_code}')
+            yield scrapy.Request(
+                url=f'https://navercomp.wisereport.co.kr/v2/company/c1030001.aspx?cmp_cd={one_code}',
+                callback=getattr(self, f'parse_c103'),
+                cb_kwargs=dict(code=one_code)
+            )
+
+    def parse_c103(self, response, code):
+        # load the page once first and click the buttons to set it up
+        self.setting_page(response.url)
+
+        # extract the table from the html and build a dataframe
+        self.driver.get(response.url)
+        time.sleep(self.WAIT)
+        html = Selector(text=self.driver.page_source)
+        table_xpath = '//table[2]'
+        df = common.get_df_from_html(html, table_xpath, 1)
+        self.logger.debug(df)
+
+        # make item to yield
+        item = items.C103items()
+        item['코드'] = code
+        item['title'] = self.title
+        item['df'] = df
+        yield item
+
+    @abstractmethod
+    def setting_page(self, url: str):
+        pass
+
+    def __del__(self):
+        if self.driver is not None:
+            print(f'Retrieve {self.name} chrome driver...')
+            self.driver.quit()
+
+
+'''
+# XPATH constants
+손익계산서 = '//*[@id="rpt_tab1"]'
+재무상태표 = '//*[@id="rpt_tab2"]'
+현금흐름표 = '//*[@id="rpt_tab3"]'
+연간 = '//*[@id="frqTyp0"]'
+분기 = '//*[@id="frqTyp1"]'
+검색 = '//*[@id="hfinGubun"]'
+'''
+
+
+class C103BQ(C103Spider):
+    name = 'c103_bq'
+
+    def __init__(self, code, mongo_client):
+        super(C103BQ, self).__init__(code, mongo_client, title='재무상태표q')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('재무상태표', '//*[@id="rpt_tab2"]'),
+            ('분기', '//*[@id="frqTyp1"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C103CQ(C103Spider):
+    name = 'c103_cq'
+
+    def __init__(self, code, mongo_client):
+        super().__init__(code, mongo_client, title='현금흐름표q')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('현금흐름표', '//*[@id="rpt_tab3"]'),
+            ('분기', '//*[@id="frqTyp1"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C103IQ(C103Spider):
+    name = 'c103_iq'
+
+    def __init__(self, code, mongo_client):
+        super().__init__(code, mongo_client, title='손익계산서q')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('손익계산서', '//*[@id="rpt_tab1"]'),
+            ('분기', '//*[@id="frqTyp1"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C103BY(C103Spider):
+    name = 'c103_by'
+
+    def __init__(self, code, mongo_client):
+        super().__init__(code, mongo_client, title='재무상태표y')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('재무상태표', '//*[@id="rpt_tab2"]'),
+            ('연간', '//*[@id="frqTyp0"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C103CY(C103Spider):
+    name = 'c103_cy'
+
+    def __init__(self, code, mongo_client):
+        super().__init__(code, mongo_client, title='현금흐름표y')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('현금흐름표', '//*[@id="rpt_tab3"]'),
+            ('연간', '//*[@id="frqTyp0"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C103IY(C103Spider):
+    name = 'c103_iy'
+
+    def __init__(self, code, mongo_client):
+        super().__init__(code, mongo_client, title='손익계산서y')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('손익계산서', '//*[@id="rpt_tab1"]'),
+            ('연간', '//*[@id="frqTyp0"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
```
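Every concrete C103 spider reduces to a `(label, xpath)` button list handed to `common.click_buttons`, whose source is not part of this diff. A hedged sketch of what such a helper would look like with Selenium 4, given how the spiders call it with `(driver, url, buttons, wait)`; the body is an assumption, not the released implementation:

```python
import time
from selenium.webdriver.common.by import By

def click_buttons(driver, url, buttons, wait):
    """Hypothetical sketch: load the page, then click each labelled XPath in order,
    sleeping between clicks so the AJAX-driven tables can refresh."""
    driver.get(url)
    time.sleep(wait)
    for label, xpath in buttons:
        print(f'Clicking {label}...')
        driver.find_element(By.XPATH, xpath).click()
        time.sleep(wait)
```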
scraper2_hj3415/nfscrapy/nfs/spiders/c104.py

```diff
@@ -0,0 +1,228 @@
+import time
+import scrapy
+from util_hj3415 import utils
+from scrapy.selector import Selector
+from abc import *
+
+from nfs import items
+from nfs.spiders import common
+
+
+class C104Spider(scrapy.Spider, metaclass=ABCMeta):
+    name = 'c104'
+    allowed_domains = ['navercomp.wisereport.co.kr']
+    WAIT = 1.5
+
+    def __init__(self, code, mongo_client, title, *args, **kwargs):
+        """
+        Base class for the C104 pages. Concrete pages subclass it.
+
+        :param code:
+        :param title: supplied by the inheriting C104 subclass
+        :param mongo_client:
+        :param args:
+        :param kwargs:
+        """
+        super(C104Spider, self).__init__(*args, **kwargs)
+        self.codes = common.adjust_arg_type(code)
+        self.mongo_client = mongo_client
+        self.driver = utils.get_driver()
+        self.title = title  # e.g. 수익성q
+
+    def start_requests(self):
+        # reference from https://docs.scrapy.org/en/latest/topics/request-response.html
+        total_count = len(self.codes)
+        print(f'Start scraping {self.name}, {total_count} codes...')
+        self.logger.info(f'entire codes list - {self.codes}')
+
+        # issue the requests that actually scrape the pages
+        for i, one_code in enumerate(self.codes):
+            print(f'{i + 1}/{total_count}. Parsing {self.title}...{one_code}')
+            yield scrapy.Request(
+                url=f'https://navercomp.wisereport.co.kr/v2/company/c1040001.aspx?cmp_cd={one_code}',
+                callback=getattr(self, f'parse_c104'),
+                cb_kwargs=dict(code=one_code)
+            )
+
+    def parse_c104(self, response, code):
+        # load the page once first and click the buttons to set it up
+        self.setting_page(response.url)
+
+        # extract the table from the html and build a dataframe
+        self.driver.get(response.url)
+        time.sleep(self.WAIT)
+        html = Selector(text=self.driver.page_source)
+        table_xpath = '//table[@class="gHead01 all-width data-list"]'
+
+        # split the table title on '_' into a list
+        title_list = self.title.split('_')
+        self.logger.debug(title_list)
+
+        # build a list of dataframes
+        df_list = []
+        for i in range(2):
+            # upper table 0, lower table 1
+            df_list.append(common.get_df_from_html(html, table_xpath, i))
+        self.logger.debug(df_list)
+
+        # pair the title list with the df list and yield, so each pair is saved to the database
+        for title, df in list(zip(title_list, df_list)):
+            # log the df
+            self.logger.info(title)
+            self.logger.debug(df)
+            # make item to yield
+            item = items.C104items()
+            item['코드'] = code
+            item['title'] = title
+            item['df'] = df
+            yield item
+
+    @abstractmethod
+    def setting_page(self, url: str):
+        pass
+
+    def __del__(self):
+        if self.driver is not None:
+            print(f'Retrieve {self.name} chrome driver...')
+            self.driver.quit()
+
+
+'''
+# XPATH constants
+수익성 = '//*[ @id="val_tab1"]'
+성장성 = '//*[ @id="val_tab2"]'
+안정성 = '//*[ @id="val_tab3"]'
+활동성 = '//*[ @id="val_tab4"]'
+
+연간 = '//*[@id="frqTyp0"]'
+분기 = '//*[@id="frqTyp1"]'
+검색 = '//*[@id="hfinGubun"]'
+
+가치분석연간 = '//*[@id="frqTyp0_2"]'
+가치분석분기 = '//*[@id="frqTyp1_2"]'
+가치분석검색 = '//*[@id="hfinGubun2"]'
+'''
+
+
+class C104AQ(C104Spider):
+    name = 'c104_aq'
+
+    def __init__(self, code, mongo_client):
+        super(C104AQ, self).__init__(code, mongo_client, title='수익성q_가치분석q')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('수익성', '//*[ @id="val_tab1"]'),
+            ('분기', '//*[@id="frqTyp1"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+            ('가치분석분기', '//*[@id="frqTyp1_2"]'),
+            ('가치분석검색', '//*[@id="hfinGubun2"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C104BQ(C104Spider):
+    name = 'c104_bq'
+
+    def __init__(self, code, mongo_client):
+        super(C104BQ, self).__init__(code, mongo_client, title='성장성q')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('성장성', '//*[ @id="val_tab2"]'),
+            ('분기', '//*[@id="frqTyp1"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C104CQ(C104Spider):
+    name = 'c104_cq'
+
+    def __init__(self, code, mongo_client):
+        super(C104CQ, self).__init__(code, mongo_client, title='안정성q')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('안정성', '//*[ @id="val_tab3"]'),
+            ('분기', '//*[@id="frqTyp1"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C104DQ(C104Spider):
+    name = 'c104_dq'
+
+    def __init__(self, code, mongo_client):
+        super(C104DQ, self).__init__(code, mongo_client, title='활동성q')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('활동성', '//*[ @id="val_tab4"]'),
+            ('분기', '//*[@id="frqTyp1"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C104AY(C104Spider):
+    name = 'c104_ay'
+
+    def __init__(self, code, mongo_client):
+        super(C104AY, self).__init__(code, mongo_client, title='수익성y_가치분석y')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('수익성', '//*[ @id="val_tab1"]'),
+            ('연간', '//*[@id="frqTyp0"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+            ('가치분석연간', '//*[@id="frqTyp0_2"]'),
+            ('가치분석검색', '//*[@id="hfinGubun2"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C104BY(C104Spider):
+    name = 'c104_by'
+
+    def __init__(self, code, mongo_client):
+        super(C104BY, self).__init__(code, mongo_client, title='성장성y')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('성장성', '//*[ @id="val_tab2"]'),
+            ('연간', '//*[@id="frqTyp0"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C104CY(C104Spider):
+    name = 'c104_cy'
+
+    def __init__(self, code, mongo_client):
+        super(C104CY, self).__init__(code, mongo_client, title='안정성y')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('안정성', '//*[ @id="val_tab3"]'),
+            ('연간', '//*[@id="frqTyp0"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+class C104DY(C104Spider):
+    name = 'c104_dy'
+
+    def __init__(self, code, mongo_client):
+        super(C104DY, self).__init__(code, mongo_client, title='활동성y')
+
+    def setting_page(self, url: str):
+        buttons = [
+            ('활동성', '//*[ @id="val_tab4"]'),
+            ('연간', '//*[@id="frqTyp0"]'),
+            ('검색', '//*[@id="hfinGubun"]'),
+        ]
+        common.click_buttons(self.driver, url, buttons, self.WAIT)
```
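C104 repeats the template-method shape of C103: the base spider owns the Selenium driver and the parse loop, and each subclass contributes only a `title` and a `setting_page` button sequence. Because the 수익성 and 가치분석 tables render together, the AQ/AY spiders carry a compound title (e.g. '수익성q_가치분석q') that parse_c104 splits on '_' so each extracted table gets its own title. A hedged usage sketch, assuming the spider class is importable at the path shown and that `mongo_client=None` makes MongoPipeline skip persistence (as pipelines.py above implements):

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from nfs.spiders.c104 import C104AQ  # assumed import path inside the nfscrapy project

process = CrawlerProcess(get_project_settings())
# keyword arguments are forwarded to C104AQ.__init__(code, mongo_client)
process.crawl(C104AQ, code='005930', mongo_client=None)
process.start()
```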