scraper2-hj3415 0.0.3__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. scraper2_hj3415/cli.py +31 -11
  2. scraper2_hj3415/miscrapy/mi/__init__.py +0 -0
  3. scraper2_hj3415/miscrapy/mi/items.py +7 -0
  4. scraper2_hj3415/miscrapy/mi/middlewares.py +103 -0
  5. scraper2_hj3415/miscrapy/mi/pipelines.py +39 -0
  6. scraper2_hj3415/miscrapy/mi/settings.py +103 -0
  7. scraper2_hj3415/miscrapy/mi/spiders/__init__.py +4 -0
  8. scraper2_hj3415/miscrapy/mi/spiders/aud.py +23 -0
  9. scraper2_hj3415/miscrapy/mi/spiders/chf.py +25 -0
  10. scraper2_hj3415/miscrapy/mi/spiders/gbond3y.py +24 -0
  11. scraper2_hj3415/miscrapy/mi/spiders/gold.py +25 -0
  12. scraper2_hj3415/miscrapy/mi/spiders/kosdaq.py +23 -0
  13. scraper2_hj3415/miscrapy/mi/spiders/kospi.py +23 -0
  14. scraper2_hj3415/miscrapy/mi/spiders/mihistory.py +333 -0
  15. scraper2_hj3415/miscrapy/mi/spiders/silver.py +25 -0
  16. scraper2_hj3415/miscrapy/mi/spiders/sp500.py +24 -0
  17. scraper2_hj3415/miscrapy/mi/spiders/usdidx.py +30 -0
  18. scraper2_hj3415/miscrapy/mi/spiders/usdkrw.py +24 -0
  19. scraper2_hj3415/miscrapy/mi/spiders/wti.py +25 -0
  20. scraper2_hj3415/miscrapy/run.py +156 -0
  21. scraper2_hj3415/miscrapy/scrapy.cfg +11 -0
  22. scraper2_hj3415/nfscrapy/nfs/items.py +24 -0
  23. scraper2_hj3415/nfscrapy/nfs/pipelines.py +94 -16
  24. scraper2_hj3415/nfscrapy/nfs/settings.py +2 -1
  25. scraper2_hj3415/nfscrapy/nfs/spiders/_c108.py +95 -0
  26. scraper2_hj3415/nfscrapy/nfs/spiders/c101.py +3 -11
  27. scraper2_hj3415/nfscrapy/nfs/spiders/c103.py +174 -0
  28. scraper2_hj3415/nfscrapy/nfs/spiders/c104.py +228 -0
  29. scraper2_hj3415/nfscrapy/nfs/spiders/c106.py +92 -0
  30. scraper2_hj3415/nfscrapy/nfs/spiders/common.py +60 -0
  31. scraper2_hj3415/nfscrapy/run.py +76 -26
  32. {scraper2_hj3415-0.0.3.dist-info → scraper2_hj3415-0.1.0.dist-info}/METADATA +1 -1
  33. scraper2_hj3415-0.1.0.dist-info/RECORD +42 -0
  34. scraper2_hj3415-0.1.0.dist-info/entry_points.txt +2 -0
  35. scraper2_hj3415-0.0.3.dist-info/RECORD +0 -17
  36. scraper2_hj3415-0.0.3.dist-info/entry_points.txt +0 -2
  37. {scraper2_hj3415-0.0.3.dist-info → scraper2_hj3415-0.1.0.dist-info}/WHEEL +0 -0
  38. {scraper2_hj3415-0.0.3.dist-info → scraper2_hj3415-0.1.0.dist-info}/licenses/LICENSE +0 -0
scraper2_hj3415/nfscrapy/nfs/pipelines.py
@@ -1,26 +1,104 @@
  from itemadapter import ItemAdapter
  from pymongo import errors
+ from pprint import pprint
+ from util_hj3415 import mongo

+ import logging
+ logger = logging.getLogger(__name__)
+ formatter = logging.Formatter('%(levelname)s: [%(name)s] %(message)s')
+ ch = logging.StreamHandler()
+ ch.setFormatter(formatter)
+ logger.addHandler(ch)
+ logger.setLevel(logging.DEBUG)

- class MongoPipeline:

-     collection_name = 'c101'
+ class ValidationPipeline:
+     def process_item(self, item, spider):
+         print(f"\tIn the {self.__class__.__name__}...", end="")
+         if spider.name == 'c101':
+             print(f" Manual calculating EPS, BPS, PER, PBR")
+             logger.debug('*** Start c101 pipeline ***')
+             logger.debug(f"Raw data - EPS:{item['EPS']} BPS:{item['BPS']} PER:{item['PER']} PBR:{item['PBR']}")
+             # look up the c104 page so eps, bps, per and pbr can be recalculated and replaced directly
+             try:
+                 logger.debug('Try to get c104 page for calculate values..')
+                 c104 = mongo.C104(spider.mongo_client, item['코드'], 'c104q')
+                 d, eps = c104.sum_recent_4q('EPS')  # sum the EPS of the most recent four quarters
+                 d, bps = c104.latest_value('BPS')  # take the BPS of the latest quarter

-     def open_spider(self, spider):
-         self.client = spider.mongo_client
+                 # lambda that computes PER and PBR
+                 cal_ratio = (lambda eps_bps, pprice:
+                              None if eps_bps is None or eps_bps == 0 else round(int(pprice) / int(eps_bps), 2))
+                 cal_per = cal_ratio(eps, item['주가'])
+                 cal_pbr = cal_ratio(bps, item['주가'])
+                 logger.debug(f"After calc data - EPS:{eps} BPS:{bps} PER:{cal_per} PBR:{cal_pbr}")
+                 logger.debug(f"*** End c101 calculation pipeline {item['코드']} ***")
+             except:
+                 logger.warning("Error on calculating custom EPS, BPS, PER, PBR, maybe DB hasn't c104q collection.")
+                 logger.warning(
+                     f"We will use default scraped values - EPS:{item['EPS']} BPS:{item['BPS']} PER:{item['PER']} PBR:{item['PBR']}")
+                 return item
+             item['EPS'], item['BPS'], item['PER'], item['PBR'] = eps, bps, cal_per, cal_pbr
+         if 'c103' in spider.name:
+             # pprint(item)
+             print(" Nothing special working")
+         if 'c104' in spider.name:
+             # pprint(item)
+             print(" Nothing special working")
+         if spider.name == 'c106':
+             pprint(item)
+             print(" Nothing special working")
+         return item

+
+ class MongoPipeline:

      def process_item(self, item, spider):
-         print("In the MongoPipeline...", end="")
-         if self.client is None:
-             print("Skip the save to mongoDB.")
+         print(f"\tIn the {self.__class__.__name__}...", end="")
+         if spider.mongo_client is None:
+             print(f"Skipping to save the data... code : {item['코드']} / spider : {spider.name}")
          else:
-             print("Save data to mongoDB.")
-             my_collection = self.client[item['코드']][self.collection_name]
-             try:
-                 my_collection.create_index('date', unique=True)
-                 my_collection.insert_one(ItemAdapter(item).asdict())
-             except errors.DuplicateKeyError:
-                 # if the scraped data is already in the database, delete it and save again
-                 my_collection.delete_many({'date': {"$gte": item['date']}})
-                 my_collection.insert_one(ItemAdapter(item).asdict())
+             print(f"Saving the {spider.name} to mongoDB...", end="")
+             if spider.name == 'c101':
+                 page = spider.name
+                 print(f" code : {item['코드']} / page : {page}")
+                 c101_col = spider.mongo_client[item['코드']][page]
+                 try:
+                     c101_col.create_index('date', unique=True)
+                     c101_col.insert_one(ItemAdapter(item).asdict())
+                 except errors.DuplicateKeyError:
+                     # if the scraped data is already in the database, delete it and save again
+                     c101_col.delete_many({'date': {"$gte": item['date']}})
+                     c101_col.insert_one(ItemAdapter(item).asdict())
+             elif 'c103' in spider.name:
+                 page = ''.join(['c103', item['title']])
+                 print(f" code : {item['코드']} / page : {page}")
+                 # logging.debug(item['df'].to_dict('records'))
+                 # r = mongo2.C103(self.client, item['코드'], page).save(item['df'])
+             elif 'c104' in spider.name:
+                 if item['title'].endswith('y'):
+                     page = 'c104y'
+                 elif item['title'].endswith('q'):
+                     page = 'c104q'
+                 else:
+                     raise ValueError
+                 print(f" code : {item['코드']} / page : {page}({item['title']})")
+                 logging.debug(item['df'].to_dict('records'))
+                 r = mongo.C104(spider.mongo_client, item['코드'], page).save_df(item['df'])
+             elif spider.name == 'c106':
+                 page = ''.join(['c106', item['title']])
+                 print(f" code : {item['코드']} / page : {page}")
+                 # logging.debug(item['df'].to_dict('records'))
+                 # if page == 'c106y':
+                 #     r = mongo2.C106Y(self.client, item['코드']).save(item['df'])
+                 # elif page == 'c106q':
+                 #     r = mongo2.C106Q(self.client, item['코드']).save(item['df'])
+                 # else:
+                 #     raise
+             """
+             elif spider.name == 'c108':
+                 page = spider.name
+                 print(f" code : {item['코드']} / page : {page}")
+                 logging.debug(item['df'].to_dict('records'))
+                 r = mongo2.C108(self.client, item['코드']).save(item['df'])
+             """
          return item
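
ValidationPipeline rebuilds the valuation figures from the freshly scraped price and the c104 collection: PER = 주가 / trailing-four-quarter EPS and PBR = 주가 / latest BPS, both rounded to two decimals, with a None guard when the denominator is missing or zero. A short worked example of the cal_ratio lambda above, using made-up figures:

    # worked example of ValidationPipeline's cal_ratio lambda (all figures below are hypothetical)
    cal_ratio = (lambda eps_bps, pprice:
                 None if eps_bps is None or eps_bps == 0 else round(int(pprice) / int(eps_bps), 2))

    price = 70000        # hypothetical current share price (item['주가'])
    eps_4q = 5000        # hypothetical sum of the last four quarterly EPS values
    bps_latest = 35000   # hypothetical BPS of the latest quarter

    print(cal_ratio(eps_4q, price))      # 14.0 -> PER
    print(cal_ratio(bps_latest, price))  # 2.0  -> PBR
    print(cal_ratio(0, price))           # None -> guards against a zero or missing denominator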
scraper2_hj3415/nfscrapy/nfs/settings.py
@@ -65,7 +65,8 @@ COOKIES_ENABLED = False
  # Configure item pipelines
  # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
  ITEM_PIPELINES = {
-     "nfs.pipelines.MongoPipeline": 300,
+     "nfs.pipelines.ValidationPipeline": 300,
+     "nfs.pipelines.MongoPipeline": 400,
  }

  # Enable and configure the AutoThrottle extension (disabled by default)
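
Scrapy calls the enabled item pipelines in ascending priority order, so every scraped item now passes through ValidationPipeline (300) before MongoPipeline (400), and each pipeline must return the item for the next one to receive it. A minimal sketch of that chaining; the pipeline bodies here are placeholders, not the project's real implementations:

    # sketch only: lower ITEM_PIPELINES values run first
    class ValidationPipeline:
        def process_item(self, item, spider):
            item['validated'] = True   # hypothetical enrichment step
            return item                # the returned item is passed to the next pipeline

    class MongoPipeline:
        def process_item(self, item, spider):
            print('would persist:', item)   # hypothetical persistence step
            return item

    ITEM_PIPELINES = {
        "nfs.pipelines.ValidationPipeline": 300,   # runs first
        "nfs.pipelines.MongoPipeline": 400,        # runs second
    }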
scraper2_hj3415/nfscrapy/nfs/spiders/_c108.py
@@ -0,0 +1,95 @@
+ import scrapy
+ import pandas as pd
+
+ from nfs import items
+ from nfs.spiders import common
+
+
+ class C108Spider(scrapy.Spider):
+     """
+     Naver no longer provides this service, so the spider has been disabled.
+     """
+     name = 'c108'
+     allowed_domains = ['navercomp.wisereport.co.kr']  # https address
+
+     def __init__(self, code, mongo_client, *args, **kwargs):
+         super(C108Spider, self).__init__(*args, **kwargs)
+         self.codes = common.adjust_arg_type(code)
+         self.mongo_client = mongo_client
+
+     def start_requests(self):
+         total_count = len(self.codes)
+         print(f'Start scraping {self.name}, {total_count} items...')
+         self.logger.info(f'entire codes list - {self.codes}')
+         for i, one_code in enumerate(self.codes):
+             print(f'{i + 1}/{total_count}. Parsing {self.name}...{one_code}')
+             # reference from https://docs.scrapy.org/en/latest/topics/request-response.html
+             yield scrapy.Request(url=f'https://navercomp.wisereport.co.kr/v2/company/c1080001.aspx?cmp_cd={one_code}',
+                                  callback=self.parse_c108,
+                                  cb_kwargs=dict(code=one_code)
+                                  )
+
+     def parse_c108(self, response, code):
+         """
+         XPATHs worked out for the C108 page
+         date_XPATH = f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[1]'  # date
+         content_XPATH = f'//*[@id="td{i}"]/@data-content'  # title and content
+         author_XPATH = f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[3]/div'  # author
+         company_XPATH = f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[4]/div'  # publisher
+         opinion_XPATH = f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[5]/div[1]'  # investment opinion
+         hprice_XPATH = f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[6]/div[1]'  # target price
+         """
+         # if no c108 data exists for the code yet, build and save an empty table
+         title = response.xpath('/html/head/title/text()').get()
+         if title == '기업재무정보 접속장애':
+             self.logger.warning(f'{code}: None C108 data...We will make empty table..')
+             # make item to yield
+             item = items.C108items()
+             item['코드'] = code
+             item['df'] = pd.DataFrame(columns=['날짜', '제목', '작성자', '제공처', '투자의견', '목표가', '내용'])
+             yield item
+             return
+
+         # reference from http://hleecaster.com/python-pandas-creating-and-loading-dataframes/ (building a dataframe)
+         opinions = []
+         # the table holds at most 20 items
+         for i in range(20):
+             one_opinion = list()
+             # 1. append the date
+             one_opinion.append(response.xpath(f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[1]/text()').get())
+             # split the extracted title-and-content string into a separate title and content
+             try:
+                 title_and_contents = response.xpath(f'//*[@id="td{i}"]/@data-content').get()\
+                     .replace("<br/><span class='comment_text'>▶</span>", '▶').split('\n')
+                 self.logger.debug(title_and_contents)
+             except AttributeError:
+                 break
+             # 2. take the title and remove it from the list
+             one_opinion.append(title_and_contents[0])
+             del title_and_contents[0]
+             # 3. extract the author, publisher, investment opinion and target price in order and append them
+             for j in range(3, 7):
+                 one_opinion.append(response.xpath(f'//*[@id="tableCmpDetail"]/tr[{(i * 2) + 1}]/td[{j}]/div/text()')
+                                    .get().replace('\t', '').replace('\r\n', ''))
+             # 4. join the remaining lines into the content
+             contents = ''
+             for item in title_and_contents:
+                 contents += item
+             one_opinion.append(contents)
+             self.logger.debug(one_opinion)
+             # 5. append the completed row to the opinions list
+             opinions.append(one_opinion)
+         self.logger.info(f'\ttotal {len(opinions)} opinions.. {code}')
+
+         df = pd.DataFrame(data=opinions, columns=['날짜', '제목', '작성자', '제공처', '투자의견', '목표가', '내용'])
+         df['내용'] = df['내용'].str.replace('\r', '')
+
+         self.logger.debug(df)
+         # make item to yield
+         item = items.C108items()
+         item['코드'] = code
+         item['df'] = df
+         yield item
+
+     def __str__(self):
+         return 'C108 Spider'
scraper2_hj3415/nfscrapy/nfs/spiders/c101.py
@@ -1,8 +1,6 @@
  import scrapy
  from nfs import items
- #from nfs.spiders import common
-
- # cmd usage : scrapy crawl c101 -a codes=005930
+ from nfs.spiders import common

  # lambda that strips or replaces miscellaneous symbols
  cleaning = lambda s: (
@@ -18,15 +16,9 @@ class C101Spider(scrapy.Spider):
      name = 'c101'
      allowed_domains = ['navercomp.wisereport.co.kr']  # https address

-     def __init__(self, codes, mongo_client, *args, **kwargs):
+     def __init__(self, code, mongo_client, *args, **kwargs):
          super(C101Spider, self).__init__(*args, **kwargs)
-         # command-line input always arrives as a string, so it must be converted into a list
-         if type(codes) == str:
-             self.codes = [codes, ]
-         elif type(codes) == list:
-             self.codes = codes
-         else:
-             raise TypeError
+         self.codes = common.adjust_arg_type(code)
          self.mongo_client = mongo_client

      def start_requests(self):
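
The string-or-list check that used to live in C101Spider.__init__ is now shared by every spider via common.adjust_arg_type. The helper's body is not part of this diff; a plausible sketch, assuming it simply mirrors the logic removed above:

    # hypothetical reconstruction of nfs/spiders/common.adjust_arg_type (not shown in this diff)
    def adjust_arg_type(code):
        """Normalize a single stock code or a list of codes into a list."""
        if isinstance(code, str):
            return [code]    # a lone code string becomes a one-element list
        elif isinstance(code, list):
            return code      # a list passes through unchanged
        raise TypeError(f'expected str or list, got {type(code)}')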
scraper2_hj3415/nfscrapy/nfs/spiders/c103.py
@@ -0,0 +1,174 @@
+ import time
+ import scrapy
+ from util_hj3415 import utils
+ from scrapy.selector import Selector
+ from abc import *
+
+ from nfs import items
+ from nfs.spiders import common
+
+
+ class C103Spider(scrapy.Spider, metaclass=ABCMeta):
+     name = 'c103'
+     allowed_domains = ['navercomp.wisereport.co.kr']
+     WAIT = 1.5
+
+     def __init__(self, code, mongo_client, title, *args, **kwargs):
+         """
+         Base class for the C103 pages; the concrete page spiders inherit from it.
+
+         :param code:
+         :param title: supplied by the inheriting C103 subclass
+         :param mongo_client:
+         :param args:
+         :param kwargs:
+         """
+         super(C103Spider, self).__init__(*args, **kwargs)
+         self.codes = common.adjust_arg_type(code)
+         self.mongo_client = mongo_client
+         self.driver = utils.get_driver()
+         self.title = title  # e.g. '재무상태표q'
+
+     def start_requests(self):
+         # reference from https://docs.scrapy.org/en/latest/topics/request-response.html
+         total_count = len(self.codes)
+         print(f'Start scraping {self.name}, {total_count} codes...')
+         self.logger.info(f'entire codes list - {self.codes}')
+
+         # issue the requests that actually scrape the pages
+         for i, one_code in enumerate(self.codes):
+             print(f'{i + 1}/{total_count}. Parsing {self.title}...{one_code}')
+             yield scrapy.Request(
+                 url=f'https://navercomp.wisereport.co.kr/v2/company/c1030001.aspx?cmp_cd={one_code}',
+                 callback=getattr(self, f'parse_c103'),
+                 cb_kwargs=dict(code=one_code)
+             )
+
+     def parse_c103(self, response, code):
+         # open the page once first and click the buttons that configure it
+         self.setting_page(response.url)
+
+         # extract the table from the html and build a dataframe
+         self.driver.get(response.url)
+         time.sleep(self.WAIT)
+         html = Selector(text=self.driver.page_source)
+         table_xpath = '//table[2]'
+         df = common.get_df_from_html(html, table_xpath, 1)
+         self.logger.debug(df)
+
+         # make item to yield
+         item = items.C103items()
+         item['코드'] = code
+         item['title'] = self.title
+         item['df'] = df
+         yield item
+
+     @abstractmethod
+     def setting_page(self, url: str):
+         pass
+
+     def __del__(self):
+         if self.driver is not None:
+             print(f'Retrieve {self.name} chrome driver...')
+             self.driver.quit()
+
+
+ '''
+ # XPATH constants
+ 손익계산서 = '//*[@id="rpt_tab1"]'
+ 재무상태표 = '//*[@id="rpt_tab2"]'
+ 현금흐름표 = '//*[@id="rpt_tab3"]'
+ 연간 = '//*[@id="frqTyp0"]'
+ 분기 = '//*[@id="frqTyp1"]'
+ 검색 = '//*[@id="hfinGubun"]'
+ '''
+
+
+ class C103BQ(C103Spider):
+     name = 'c103_bq'
+
+     def __init__(self, code, mongo_client):
+         super(C103BQ, self).__init__(code, mongo_client, title='재무상태표q')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('재무상태표', '//*[@id="rpt_tab2"]'),
+             ('분기', '//*[@id="frqTyp1"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C103CQ(C103Spider):
+     name = 'c103_cq'
+
+     def __init__(self, code, mongo_client):
+         super().__init__(code, mongo_client, title='현금흐름표q')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('현금흐름표', '//*[@id="rpt_tab3"]'),
+             ('분기', '//*[@id="frqTyp1"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C103IQ(C103Spider):
+     name = 'c103_iq'
+
+     def __init__(self, code, mongo_client):
+         super().__init__(code, mongo_client, title='손익계산서q')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('손익계산서', '//*[@id="rpt_tab1"]'),
+             ('분기', '//*[@id="frqTyp1"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C103BY(C103Spider):
+     name = 'c103_by'
+
+     def __init__(self, code, mongo_client):
+         super().__init__(code, mongo_client, title='재무상태표y')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('재무상태표', '//*[@id="rpt_tab2"]'),
+             ('연간', '//*[@id="frqTyp0"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C103CY(C103Spider):
+     name = 'c103_cy'
+
+     def __init__(self, code, mongo_client):
+         super().__init__(code, mongo_client, title='현금흐름표y')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('현금흐름표', '//*[@id="rpt_tab3"]'),
+             ('연간', '//*[@id="frqTyp0"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C103IY(C103Spider):
+     name = 'c103_iy'
+
+     def __init__(self, code, mongo_client):
+         super().__init__(code, mongo_client, title='손익계산서y')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('손익계산서', '//*[@id="rpt_tab1"]'),
+             ('연간', '//*[@id="frqTyp0"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
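
Each concrete C103 spider only declares which tab, period, and search buttons to press; the clicking itself is delegated to common.click_buttons, which is added in common.py but not shown in this diff. A minimal sketch of what such a Selenium-based helper could look like, assuming it loads the URL once and clicks each XPath in order with a pause in between:

    # hypothetical sketch of nfs/spiders/common.click_buttons (the real implementation is not in this diff)
    import time
    from selenium.webdriver.common.by import By

    def click_buttons(driver, url, buttons, wait):
        driver.get(url)                      # load the page once
        time.sleep(wait)
        for name, xpath in buttons:          # e.g. ('분기', '//*[@id="frqTyp1"]')
            driver.find_element(By.XPATH, xpath).click()
            time.sleep(wait)                 # give the ajax-refreshed table time to render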
scraper2_hj3415/nfscrapy/nfs/spiders/c104.py
@@ -0,0 +1,228 @@
+ import time
+ import scrapy
+ from util_hj3415 import utils
+ from scrapy.selector import Selector
+ from abc import *
+
+ from nfs import items
+ from nfs.spiders import common
+
+
+ class C104Spider(scrapy.Spider, metaclass=ABCMeta):
+     name = 'c104'
+     allowed_domains = ['navercomp.wisereport.co.kr']
+     WAIT = 1.5
+
+     def __init__(self, code, mongo_client, title, *args, **kwargs):
+         """
+         Base class for the C104 pages; the concrete page spiders inherit from it.
+
+         :param code:
+         :param title: supplied by the inheriting C104 subclass
+         :param mongo_client:
+         :param args:
+         :param kwargs:
+         """
+         super(C104Spider, self).__init__(*args, **kwargs)
+         self.codes = common.adjust_arg_type(code)
+         self.mongo_client = mongo_client
+         self.driver = utils.get_driver()
+         self.title = title  # e.g. '수익성q'
+
+     def start_requests(self):
+         # reference from https://docs.scrapy.org/en/latest/topics/request-response.html
+         total_count = len(self.codes)
+         print(f'Start scraping {self.name}, {total_count} codes...')
+         self.logger.info(f'entire codes list - {self.codes}')
+
+         # issue the requests that actually scrape the pages
+         for i, one_code in enumerate(self.codes):
+             print(f'{i + 1}/{total_count}. Parsing {self.title}...{one_code}')
+             yield scrapy.Request(
+                 url=f'https://navercomp.wisereport.co.kr/v2/company/c1040001.aspx?cmp_cd={one_code}',
+                 callback=getattr(self, f'parse_c104'),
+                 cb_kwargs=dict(code=one_code)
+             )
+
+     def parse_c104(self, response, code):
+         # open the page once first and click the buttons that configure it
+         self.setting_page(response.url)
+
+         # extract the tables from the html and build dataframes
+         self.driver.get(response.url)
+         time.sleep(self.WAIT)
+         html = Selector(text=self.driver.page_source)
+         table_xpath = '//table[@class="gHead01 all-width data-list"]'
+
+         # split the title on '_' into a list of table names
+         title_list = self.title.split('_')
+         self.logger.debug(title_list)
+
+         # build the list of dataframes
+         df_list = []
+         for i in range(2):
+             # 0: upper table, 1: lower table
+             df_list.append(common.get_df_from_html(html, table_xpath, i))
+         self.logger.debug(df_list)
+
+         # pair each table name with its dataframe and yield them so they can be saved to the database
+         for title, df in list(zip(title_list, df_list)):
+             # log the dataframe
+             self.logger.info(title)
+             self.logger.debug(df)
+             # make item to yield
+             item = items.C104items()
+             item['코드'] = code
+             item['title'] = title
+             item['df'] = df
+             yield item
+
+     @abstractmethod
+     def setting_page(self, url: str):
+         pass
+
+     def __del__(self):
+         if self.driver is not None:
+             print(f'Retrieve {self.name} chrome driver...')
+             self.driver.quit()
+
+
+ '''
+ # XPATH constants
+ 수익성 = '//*[ @id="val_tab1"]'
+ 성장성 = '//*[ @id="val_tab2"]'
+ 안정성 = '//*[ @id="val_tab3"]'
+ 활동성 = '//*[ @id="val_tab4"]'
+
+ 연간 = '//*[@id="frqTyp0"]'
+ 분기 = '//*[@id="frqTyp1"]'
+ 검색 = '//*[@id="hfinGubun"]'
+
+ 가치분석연간 = '//*[@id="frqTyp0_2"]'
+ 가치분석분기 = '//*[@id="frqTyp1_2"]'
+ 가치분석검색 = '//*[@id="hfinGubun2"]'
+ '''
+
+
+ class C104AQ(C104Spider):
+     name = 'c104_aq'
+
+     def __init__(self, code, mongo_client):
+         super(C104AQ, self).__init__(code, mongo_client, title='수익성q_가치분석q')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('수익성', '//*[ @id="val_tab1"]'),
+             ('분기', '//*[@id="frqTyp1"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+             ('가치분석분기', '//*[@id="frqTyp1_2"]'),
+             ('가치분석검색', '//*[@id="hfinGubun2"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C104BQ(C104Spider):
+     name = 'c104_bq'
+
+     def __init__(self, code, mongo_client):
+         super(C104BQ, self).__init__(code, mongo_client, title='성장성q')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('성장성', '//*[ @id="val_tab2"]'),
+             ('분기', '//*[@id="frqTyp1"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C104CQ(C104Spider):
+     name = 'c104_cq'
+
+     def __init__(self, code, mongo_client):
+         super(C104CQ, self).__init__(code, mongo_client, title='안정성q')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('안정성', '//*[ @id="val_tab3"]'),
+             ('분기', '//*[@id="frqTyp1"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C104DQ(C104Spider):
+     name = 'c104_dq'
+
+     def __init__(self, code, mongo_client):
+         super(C104DQ, self).__init__(code, mongo_client, title='활동성q')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('활동성', '//*[ @id="val_tab4"]'),
+             ('분기', '//*[@id="frqTyp1"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C104AY(C104Spider):
+     name = 'c104_ay'
+
+     def __init__(self, code, mongo_client):
+         super(C104AY, self).__init__(code, mongo_client, title='수익성y_가치분석y')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('수익성', '//*[ @id="val_tab1"]'),
+             ('연간', '//*[@id="frqTyp0"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+             ('가치분석연간', '//*[@id="frqTyp0_2"]'),
+             ('가치분석검색', '//*[@id="hfinGubun2"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C104BY(C104Spider):
+     name = 'c104_by'
+
+     def __init__(self, code, mongo_client):
+         super(C104BY, self).__init__(code, mongo_client, title='성장성y')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('성장성', '//*[ @id="val_tab2"]'),
+             ('연간', '//*[@id="frqTyp0"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C104CY(C104Spider):
+     name = 'c104_cy'
+
+     def __init__(self, code, mongo_client):
+         super(C104CY, self).__init__(code, mongo_client, title='안정성y')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('안정성', '//*[ @id="val_tab3"]'),
+             ('연간', '//*[@id="frqTyp0"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
+
+
+ class C104DY(C104Spider):
+     name = 'c104_dy'
+
+     def __init__(self, code, mongo_client):
+         super(C104DY, self).__init__(code, mongo_client, title='활동성y')
+
+     def setting_page(self, url: str):
+         buttons = [
+             ('활동성', '//*[ @id="val_tab4"]'),
+             ('연간', '//*[@id="frqTyp0"]'),
+             ('검색', '//*[@id="hfinGubun"]'),
+         ]
+         common.click_buttons(self.driver, url, buttons, self.WAIT)
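
For the combined pages (C104AQ and C104AY) the title carries two table names joined by an underscore, and parse_c104 pairs them positionally with the upper and lower tables it scrapes, yielding one item per pair. A small worked illustration of that pairing (placeholder dataframes, not data from the diff):

    import pandas as pd

    # how parse_c104 matches table names to scraped tables
    title_list = '수익성q_가치분석q'.split('_')   # -> ['수익성q', '가치분석q']
    df_list = [pd.DataFrame(), pd.DataFrame()]    # placeholders: index 0 = upper table, 1 = lower table
    for title, df in zip(title_list, df_list):
        print(title, df.shape)                    # '수익성q' pairs with the upper table, '가치분석q' with the lower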