scraping-rtn 0.0.1__tar.gz → 0.0.8.0__tar.gz

This diff shows the differences in content between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
@@ -1,15 +1,15 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: scraping_rtn
3
- Version: 0.0.1
3
+ Version: 0.0.8.0
4
4
  Summary: package to scrape gymnastics data from Road To Nationals
5
5
  Author-email: Claire Harmon <ceharmon220@gmail.com>
6
- Project-URL: Homepage, https://github.com/cgn-charmon/scraping_rtn
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ceharmon/scraping_rtn
7
8
  Classifier: Programming Language :: Python :: 3
8
9
  Classifier: License :: OSI Approved :: MIT License
9
10
  Classifier: Operating System :: OS Independent
10
11
  Requires-Python: >=3.9
11
12
  Description-Content-Type: text/markdown
12
- License-File: LICENSE
13
13
  Requires-Dist: pandas>=1.5.3
14
14
  Requires-Dist: numpy>=1.23.5
15
15
  Requires-Dist: requests>=2.28.1
@@ -1,21 +1,25 @@
1
1
  [build-system]
2
- requires = ["setuptools>=61.0"]
2
+ requires = ["setuptools>=69.0", "wheel"]
3
3
  build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "scraping_rtn"
7
- version = "0.0.1"
7
+ version = "0.0.8.0"
8
8
  authors = [
9
- { name="Claire Harmon", email="ceharmon220@gmail.com" },
9
+ { name = "Claire Harmon", email = "ceharmon220@gmail.com" },
10
10
  ]
11
11
  description = "package to scrape gymnastics data from Road To Nationals"
12
12
  readme = "README.md"
13
13
  requires-python = ">=3.9"
14
+
15
+ license = { text = "MIT" }
16
+
14
17
  dependencies = [
15
18
  "pandas >=1.5.3",
16
19
  "numpy >=1.23.5",
17
20
  "requests >=2.28.1"
18
21
  ]
22
+
19
23
  classifiers = [
20
24
  "Programming Language :: Python :: 3",
21
25
  "License :: OSI Approved :: MIT License",
@@ -23,4 +27,7 @@ classifiers = [
23
27
  ]
24
28
 
25
29
  [project.urls]
26
- "Homepage" = "https://github.com/cgn-charmon/scraping_rtn"
30
+ Homepage = "https://github.com/ceharmon/scraping_rtn"
31
+
32
+ [tool.setuptools]
33
+ license-files = []
@@ -1,11 +1,12 @@
1
- from .src import EVENT_MAP, EVENTS, get_data_from_api, fix_opponents, normalize_date, merge_dicts, get_extra_cols, \
2
- SCHEDULE_COLS, RESULTS_COLS, IND_RESULTS_COLS, ROSTER_COLS
1
+ from .src import EVENT_MAP, EVENTS, get_session, get_data_from_api, fix_opponents, normalize_date, merge_dicts, \
2
+ get_extra_cols, SCHEDULE_COLS, RESULTS_COLS, IND_RESULTS_COLS, ROSTER_COLS
3
3
  import pandas as pd
4
4
  import numpy as np
5
+ from datetime import datetime
5
6
 
6
7
 
7
8
  class RtnSingleTeamYear(object):
8
- def __init__(self, year, team_name, team_id=None):
9
+ def __init__(self, year, team_name, team_id=None, session=None):
9
10
  self.year = year
10
11
  if team_name is not None:
11
12
  self.team_name = team_name
@@ -14,33 +15,47 @@ class RtnSingleTeamYear(object):
14
15
  else:
15
16
  self.team_id = team_id
16
17
 
17
- def get_team_mapping(self):
18
- all_teams_data = get_data_from_api(endpoint='gymnasts2', suffix=str(self.year) + '/1').json()
18
+ if session is None:
19
+ self.session = get_session()
20
+ else:
21
+ self.session = session
22
+
23
+ def get_team_mapping(self, force_update=False):
24
+ if force_update:
25
+ get_data_from_api.cache_clear()
26
+
27
+ all_teams_data = get_data_from_api(endpoint='gymnasts2', suffix=str(self.year) + '/1', session=self.session).json()
19
28
  return {team['team_name']: team['id'] for team in all_teams_data['teams']}
20
29
 
21
30
  def get_team_id(self):
22
31
  if not hasattr(self, 'team_id_map'):
23
32
  self.team_id_map = self.get_team_mapping()
24
33
 
34
+ if self.team_name and self.team_name not in self.team_id_map.keys():
35
+ raise ValueError(f'Unknown team name: {self.team_name}')
36
+
25
37
  return self.team_id_map.get(self.team_name, -1)
26
38
  # if self.team_name in self.team_id_map.keys():
27
39
  # return self.team_id_map[self.team_name]
28
40
  # else:
29
41
  # raise ValueError(f'{self.team_name} does not exist in data for {self.year}')
30
42
 
31
- def _get_raw_roster(self):
43
+ def _get_raw_roster(self, force_update=False):
32
44
  rename_map = {'id': 'Gymnast ID', 'hometown': 'Hometown', 'school_year': 'School Year', 'events': 'Events'}
33
45
  school_year_map = {'1': 'FR', '2': 'SO', '3': 'JR', '4': 'SR'}
34
46
 
35
- roster_data = get_data_from_api(endpoint='rostermain', suffix=str(self.year)+'/'+str(self.team_id)+'/1').json()
47
+ if force_update:
48
+ get_data_from_api.cache_clear()
36
49
 
37
- self._raw_roster = [{**{rename_map.get(k, k): v if k != 'school_year' else school_year_map[v] for k, v in data.items()},
50
+ roster_data = get_data_from_api(endpoint='rostermain', suffix=str(self.year)+'/'+str(self.team_id)+'/1', session=self.session).json()
51
+
52
+ self._raw_roster = [{**{rename_map.get(k, k): v if k != 'school_year' else school_year_map.get(v, '') for k, v in data.items()},
38
53
  **{'Name': data['fname'] + ' ' + data['lname'], 'Team': self.team_name}}
39
54
  for data in roster_data]
40
55
 
41
- def get_roster(self, include_hometowns=False, include_class=False, include_events=False):
56
+ def get_roster(self, include_hometowns=False, include_class=False, include_events=False, force_update=False):
42
57
  if not hasattr(self, 'raw_roster'):
43
- self._get_raw_roster()
58
+ self._get_raw_roster(force_update=force_update)
44
59
 
45
60
  extra_cols = get_extra_cols(include_hometowns=include_hometowns, include_class=include_class, include_events=include_events)
46
61
 
@@ -52,20 +67,23 @@ class RtnSingleTeamYear(object):
52
67
 
53
68
  return self.roster
54
69
 
55
- def _get_raw_season_results(self):
56
- meets = get_data_from_api(endpoint='dashboard', suffix=str(self.year)+'/'+str(self.team_id)).json()
70
+ def _get_raw_season_results(self, force_update=False):
71
+ if force_update:
72
+ get_data_from_api.cache_clear()
73
+
74
+ meets = get_data_from_api(endpoint='dashboard', suffix=str(self.year)+'/'+str(self.team_id), session=self.session).json()
57
75
  name_map = {'team_id': 'Team ID', 'team_name': 'Team', 'meet_id': 'Team Meet ID',
58
76
  'meet_date': 'Meet Date', 'team_score': 'Score', 'home': 'Home/Away',
59
77
  'opponent': 'Opponents', 'meet_desc': 'Meet Name', 'linked_id': 'Meet ID'}
60
78
 
61
79
  self._raw_season_results = [{name_map.get(k, k): fix_opponents(v) if k == 'opponent'
62
80
  else (normalize_date(v) if k == 'meet_date' else v)
63
- for k, v in data.items() if k != 'jas'} for data in meets['meets']]
81
+ for k, v in data.items() if k != 'jas'} for data in meets['meets'] if data['team_name'] == self.team_name]
64
82
  self._raw_schedule = [{k: v for k, v in data.items() if k not in ('Score', 'VT', 'UB', 'BB', 'FX')} for data in self._raw_season_results]
65
83
 
66
- def get_schedule(self):
84
+ def get_schedule(self, force_update=False):
67
85
  if not hasattr(self, '_raw_schedule'):
68
- self._get_raw_season_results()
86
+ self._get_raw_season_results(force_update=force_update)
69
87
 
70
88
  if len(self._raw_schedule) > 0:
71
89
  return pd.DataFrame(self._raw_schedule)
@@ -87,29 +105,32 @@ class RtnSingleTeamYear(object):
87
105
  * Uses team meet id to join back to meet info, such as opponent, etc.
88
106
  """
89
107
  if not hasattr(self, '_raw_season_results'):
90
- self._get_raw_season_results()
108
+ self._get_raw_season_results(force_update=force_update)
91
109
 
92
110
  if len(self._raw_season_results) > 0:
93
111
  if (len({'VT', 'UB', 'BB', 'FX'}.intersection(self._raw_season_results[0].keys())) != 4 or force_update):
94
112
  if method == 'team_consistency':
95
- self._team_event_scores_team_consistency()
113
+ self._team_event_scores_team_consistency(force_update=force_update)
96
114
  elif method == 'by_meet':
97
- self._team_event_scores_by_meet()
115
+ self._team_event_scores_by_meet(force_update=force_update)
98
116
  else:
99
117
  raise ValueError('Method must be "team_consistency" or "by_meet"')
100
118
 
101
119
  # TODO: different way to drop duplicates?
102
- self.season_results = pd.DataFrame(self._raw_season_results).drop_duplicates()
120
+ self.season_results = pd.DataFrame(self._raw_season_results).dropna(subset=['Score']).drop_duplicates()
103
121
  else:
104
122
  self.season_results = pd.DataFrame(columns=SCHEDULE_COLS + RESULTS_COLS)
105
123
 
106
124
  return self.season_results
107
125
 
108
- def _team_event_scores_by_meet(self):
126
+ def _team_event_scores_by_meet(self, force_update=False):
109
127
  team_scores_all = []
110
- for meet_id in [data['Team Meet ID'] for data in self._raw_season_results]:
128
+ for meet_id in [data['Team Meet ID'] for data in self._raw_season_results if data['Meet Date'] <= datetime.now()]:
111
129
  try:
112
- meet_res = get_data_from_api(endpoint='meetresults', suffix=str(meet_id)).json()
130
+ if force_update:
131
+ get_data_from_api.cache_clear()
132
+
133
+ meet_res = get_data_from_api(endpoint='meetresults', suffix=str(meet_id), session=self.session).json()
113
134
  # This API call returns scores from all teams at this meet, not just this team. Need to pick out correct score
114
135
  team_scores = [score for score in meet_res['teams'] if score['tname'] == self.team_name and score['mid'] == str(meet_id)]
115
136
  assert len(team_scores) == 1, 'Multiple team scores??'
@@ -124,10 +145,13 @@ class RtnSingleTeamYear(object):
124
145
  for i in range(len(self._raw_season_results)):
125
146
  self._raw_season_results[i].update({'VT': np.nan, 'UB': np.nan, 'BB': np.nan, 'FX': np.nan})
126
147
 
127
- def _team_event_scores_team_consistency(self):
128
- res = get_data_from_api(endpoint='teamConsistency', suffix=f'{self.year}/{self.team_id}').json()
148
+ def _team_event_scores_team_consistency(self, force_update=False):
149
+ if force_update:
150
+ get_data_from_api.cache_clear()
151
+
152
+ res = get_data_from_api(endpoint='teamConsistency', suffix=f'{self.year}/{self.team_id}', session=self.session).json()
129
153
  if len(res['labels']) == 0:
130
- print(f'No team consistency data found for year {self.year}')
154
+ print(f'No team consistency data found for {self.team_name} in {self.year}')
131
155
  for i in range(len(self._raw_season_results)):
132
156
  self._raw_season_results[i].update({'VT': np.nan, 'UB': np.nan, 'BB': np.nan, 'FX': np.nan})
133
157
  else:
@@ -145,12 +169,10 @@ class RtnSingleTeamYear(object):
145
169
  Methods:
146
170
  * Individual Consistency - Uses Individual Consistency tab from RTN
147
171
  * Tends to have more complete data, especially for older years
148
- * Requires summing of all events to get AA (code does this for you)
149
172
  * Relies on date to join back to meet info, such as opponent, etc.
150
173
  * One API call per gymnast, relative speed depends on number of meets vs number of gymnasts
151
174
  * By Meet - loops through each meet to get scores
152
175
  * Older meets tend to be missing
153
- * Includes AA scores in the response
154
176
  * Uses team meet id to join back to meet info, such as opponent, etc.
155
177
  * One API call per meet, relative speed depends on number of meets vs number of gymnasts
156
178
  """
@@ -162,20 +184,31 @@ class RtnSingleTeamYear(object):
162
184
  if not hasattr(self, '_raw_roster'):
163
185
  self.get_roster()
164
186
 
165
- self._individual_scores_individual_consistency()
187
+ self._individual_scores_individual_consistency(force_update=force_update)
166
188
  elif method == 'by_meet':
167
- self._individual_scores_by_meet()
189
+ self._individual_scores_by_meet(force_update=force_update)
168
190
  else:
169
191
  raise ValueError('Method must be "individual_consistency" or "by_meet"')
170
192
 
171
193
  return self.individual_results
172
194
 
173
- def _individual_scores_by_meet(self):
195
+ def _individual_scores_by_meet(self, force_update=False):
174
196
  individual_scores_all = []
175
- for meet_id in [meet['Team Meet ID'] for meet in self._raw_schedule]:
197
+ for meet_id in [meet['Team Meet ID'] for meet in self._raw_schedule if meet['Meet Date'] <= datetime.now()]:
176
198
  try:
177
- meet_res = get_data_from_api(endpoint='meetresults', suffix=str(meet_id)).json()
178
- team_inds = [ind for ind, scores in enumerate(meet_res['scores']) if len(scores) > 0 and scores[0]['team_name'] == self.team_name]
199
+ if force_update:
200
+ get_data_from_api.cache_clear()
201
+
202
+ meet_res = get_data_from_api(endpoint='meetresults', suffix=str(meet_id), session=self.session).json()
203
+ if len(meet_res) == 0 or len(meet_res['scores']) == 0 or len(meet_res['scores'][0]) == 0:
204
+ print(f'No data found for meet {meet_id}')
205
+ continue
206
+
207
+ if 'team_name' in meet_res['scores'][0][0]:
208
+ team_inds = [ind for ind, scores in enumerate(meet_res['scores']) if len(scores) > 0 and scores[0]['team_name'] == self.team_name]
209
+ else:
210
+ raise ValueError('Key not found')
211
+
179
212
  if len(team_inds) == 0:
180
213
  print(f'No scores found at meet {meet_id}')
181
214
  continue
@@ -195,16 +228,20 @@ class RtnSingleTeamYear(object):
195
228
  if len(individual_scores_all) > 0:
196
229
  merge_dicts(dict1=individual_scores_all, dict2=self._raw_schedule, merge_field='Team Meet ID')
197
230
  self.individual_results = pd.DataFrame(individual_scores_all)
231
+ self.individual_results['AA'] = self.individual_results[['VT', 'UB', 'BB', 'FX']].dropna(how='any').astype(float).T.sum().round(4)
198
232
  else:
199
233
  self.individual_results = pd.DataFrame(columns=['Meet Date', 'VT', 'UB', 'BB', 'FX', 'AA', 'Gymnast ID', 'Name',
200
234
  'Team ID', 'Team', 'Team Meet ID', 'Home/Away', 'Opponents',
201
235
  'Meet Name', 'Meet ID'])
202
236
 
203
- def _individual_scores_individual_consistency(self):
237
+ def _individual_scores_individual_consistency(self, force_update=False):
204
238
  ind_consistency_all = []
205
239
  for gymnast in self._raw_roster:
206
240
  try:
207
- res = get_data_from_api(endpoint='indConsistency', suffix=f"{self.year}/{gymnast['Gymnast ID']}").json()
241
+ if force_update:
242
+ get_data_from_api.cache_clear()
243
+
244
+ res = get_data_from_api(endpoint='indConsistency', suffix=f"{self.year}/{gymnast['Gymnast ID']}", session=self.session).json()
208
245
  ind_consistency = [{'Meet Date': normalize_date(res['labels'][i][:7] + str(self.year), dt_format='%b-%d-%Y'),
209
246
  'VT': round(float(res['vts'][i]), 4) if res['vts'][i] is not None else np.nan,
210
247
  'UB': round(float(res['ubs'][i]), 4) if res['ubs'][i] is not None else np.nan,
@@ -225,24 +262,26 @@ class RtnSingleTeamYear(object):
225
262
  else:
226
263
  self.individual_results = pd.DataFrame(columns=SCHEDULE_COLS + IND_RESULTS_COLS)
227
264
 
228
- def get_individual_nqs(self):
265
+ def get_individual_nqs(self, force_update=False):
229
266
  if not hasattr(self, '_raw_roster'):
230
- self._get_raw_roster()
267
+ self._get_raw_roster(force_update=force_update)
231
268
 
232
269
  if not hasattr(self, '_raw_individual_nqs'):
233
- self._get_raw_individual_nqs()
270
+ self._get_raw_individual_nqs(force_update=force_update)
234
271
 
235
272
  if len(self._raw_individual_nqs) > 0:
236
273
  return pd.DataFrame(self._raw_individual_nqs)
237
274
  else:
238
275
  return pd.DataFrame(columns=ROSTER_COLS + EVENTS) # + ['AA'])
239
276
 
240
- def _get_raw_individual_nqs(self):
277
+ def _get_raw_individual_nqs(self, force_update=False):
241
278
  name_map = {'maxv': 'VT', 'maxub': 'UB', 'maxbb': 'BB', 'maxfx': 'FX',
242
279
  # 'maxaa': 'AA',
243
280
  'gid': 'Gymnast ID'}
281
+ if force_update:
282
+ get_data_from_api.cache_clear()
244
283
 
245
- nqsData = get_data_from_api(endpoint='rostermain', suffix=f'{self.year}/{self.team_id}/4').json()
284
+ nqsData = get_data_from_api(endpoint='rostermain', suffix=f'{self.year}/{self.team_id}/4', session=self.session).json()
246
285
  ind_nqs = [{name_map[k]: round(float(v), 4) if k != 'gid' and v != '' else (np.nan if k != 'gid' else v)
247
286
  for k, v in data.items() if k in name_map.keys()} for data in nqsData['ind']]
248
287
 
@@ -253,18 +292,25 @@ class RtnSingleTeamYear(object):
253
292
  else:
254
293
  self._raw_individual_nqs = []
255
294
 
256
- def _get_current_week(self):
295
+ def _get_current_week(self, force_update=False):
257
296
  if not hasattr(self, 'week'):
258
- return get_data_from_api(endpoint='currentweek', suffix=str(self.year)).json()['max']
297
+ if force_update:
298
+ get_data_from_api.cache_clear()
259
299
 
260
- def _get_raw_rankings(self, team_vs_ind, event, week):
300
+ week_data = get_data_from_api(endpoint='currentweek', suffix=str(self.year), session=self.session).json()
301
+ return min(int(week_data['week']), int(week_data['max']))
302
+
303
+ def _get_raw_rankings(self, team_vs_ind, event, week, force_update=False):
261
304
  team_ind_map = {'team': 0, 'ind': 1}
262
305
  event_api_map = {'VT': 1, 'UB': 2, 'BB': 3, 'FX': 4, 'AA': 5}
263
306
  rename_map = {'rank': 'Rank', 'gid': 'Gymnast ID', 'team': 'Team', 'tid': 'Team ID',
264
307
  'rqs': 'NQS', 'reg': 'Region', 'con': 'Conference', 'div': 'Division',
265
308
  'usag': 'USAG', 'ave': 'Average', 'high': 'High', 'name': 'Team'}
266
309
 
267
- res = get_data_from_api(endpoint='results', suffix=f'{self.year}/{week}/{team_ind_map[team_vs_ind]}/{event_api_map[event]}').json()
310
+ if force_update:
311
+ get_data_from_api.cache_clear()
312
+
313
+ res = get_data_from_api(endpoint='results', suffix=f'{self.year}/{week}/{team_ind_map[team_vs_ind]}/{event_api_map[event]}', session=self.session).json()
268
314
  if team_vs_ind == 'ind':
269
315
  self._raw_rankings[team_vs_ind][event] = [{**{rename_map.get(k): float(v) if k in ['rqs', 'ave', 'high'] else v for k, v in data.items() if k in rename_map},
270
316
  **{'Name': data['fname'] + ' ' + data['lname'], 'Event': event}}
@@ -274,9 +320,9 @@ class RtnSingleTeamYear(object):
274
320
  **{'Event': event}}
275
321
  for data in res['data']]
276
322
 
277
- def get_overall_rankings(self, team_vs_ind='team', event='AA', week=None):
323
+ def get_overall_rankings(self, team_vs_ind='team', event='AA', week=None, force_update=False):
278
324
  if not week:
279
- week = self._get_current_week()
325
+ week = self._get_current_week(force_update=force_update)
280
326
 
281
327
  if not hasattr(self, '_raw_rankings'):
282
328
  self._raw_rankings = {'team': {event: None for event in EVENT_MAP.values()},
@@ -288,7 +334,7 @@ class RtnSingleTeamYear(object):
288
334
  'Division', 'Conference', 'Region', 'USAG']}
289
335
 
290
336
  if self._raw_rankings[team_vs_ind][event] is None:
291
- self._get_raw_rankings(team_vs_ind=team_vs_ind, event=event, week=week)
337
+ self._get_raw_rankings(team_vs_ind=team_vs_ind, event=event, week=week, force_update=force_update)
292
338
 
293
339
  return pd.DataFrame(self._raw_rankings[team_vs_ind][event])[col_orders[team_vs_ind]]
294
340
 
@@ -0,0 +1,107 @@
1
+ from .RtnSingleTeamYear import RtnSingleTeamYear
2
+ from .src import get_session, validate_input, get_extra_cols, SCHEDULE_COLS, RESULTS_COLS, IND_RESULTS_COLS, EVENTS, ROSTER_COLS
3
+ import pandas as pd
4
+
5
+ BLANK_SPACES = ' '*30
6
+
7
+
8
+ def save(df, filename):
9
+ df.to_csv(filename, index=False)
10
+
11
+
12
+ def all_teams(year, force_update=False):
13
+ rtn = RtnSingleTeamYear(year=year, team_name=None)
14
+ return list(rtn.get_team_mapping(force_update=force_update).keys())
15
+
16
+
17
+ def roster(year, teams, include_hometowns=False, include_class=False, include_events=False, verbose=False, force_update=False):
18
+ teams = validate_input(teams)
19
+ session = get_session()
20
+
21
+ all_rosters = []
22
+ for i, team in enumerate(teams):
23
+ if verbose:
24
+ print(f'Getting roster for {team}{BLANK_SPACES}', end='\r' if team != teams[-1] else None)
25
+ rtn = RtnSingleTeamYear(year=year, team_name=team, session=session)
26
+ res = rtn.get_roster(include_hometowns=include_hometowns, include_class=include_class,
27
+ include_events=include_events, force_update=force_update if i == 0 else False)
28
+ if verbose and len(res) == 0:
29
+ print(f'\tNo roster found for {team}')
30
+ all_rosters.append(res)
31
+
32
+ extra_cols = get_extra_cols(include_hometowns=include_hometowns, include_class=include_class, include_events=include_events)
33
+ return pd.concat(all_rosters)[ROSTER_COLS + extra_cols]
34
+
35
+
36
+ def schedule(year, teams, verbose=False, force_update=False):
37
+ teams = validate_input(teams)
38
+ session = get_session()
39
+
40
+ all_schedules = []
41
+ for i, team in enumerate(teams):
42
+ if verbose:
43
+ print(f'Getting schedule for {team}{BLANK_SPACES}', end='\r' if team != teams[-1] else None)
44
+ rtn = RtnSingleTeamYear(year=year, team_name=team, session=session)
45
+ res = rtn.get_schedule(force_update=force_update if i == 0 else False)
46
+ if verbose and len(res) == 0:
47
+ print(f'\tNo schedule found for {team}')
48
+ all_schedules.append(res)
49
+
50
+ return pd.concat(all_schedules)[SCHEDULE_COLS]
51
+
52
+
53
+ def team_results(year, teams, method='team_consistency', force_update=False, verbose=False):
54
+ teams = validate_input(teams)
55
+ session = get_session()
56
+
57
+ all_results = []
58
+ for i, team in enumerate(teams):
59
+ if verbose:
60
+ print(f'Getting schedule and results for {team}{BLANK_SPACES}', end='\r' if team != teams[-1] else None)
61
+ rtn = RtnSingleTeamYear(year=year, team_name=team, session=session)
62
+ res = rtn.get_team_scores(method=method, force_update=force_update if i == 0 else False)
63
+ if verbose and len(res) == 0:
64
+ print(f'\tNo schedule and results found for {team}')
65
+ all_results.append(res)
66
+
67
+ return pd.concat(all_results)[SCHEDULE_COLS + RESULTS_COLS]
68
+
69
+
70
+ def individual_results(year, teams, method='by_meet', force_update=False, verbose=False):
71
+ teams = validate_input(teams)
72
+ session = get_session()
73
+
74
+ all_scores = []
75
+ for i, team in enumerate(teams):
76
+ if verbose:
77
+ print(f'Getting scores for {team}{BLANK_SPACES}', end='\r' if team != teams[-1] else None)
78
+ rtn = RtnSingleTeamYear(year=year, team_name=team, session=session)
79
+ res = rtn.get_individual_scores(method=method, force_update=force_update if i == 0 else False)
80
+ if verbose and len(res) == 0:
81
+ print(f'\tNo scores found for {team}')
82
+ all_scores.append(res)
83
+
84
+ return pd.concat(all_scores)[SCHEDULE_COLS + IND_RESULTS_COLS]
85
+
86
+
87
+ def individual_nqs(year, teams, verbose=False, force_update=False):
88
+ teams = validate_input(teams)
89
+ session = get_session()
90
+
91
+ all_nqs = []
92
+ for i, team in enumerate(teams):
93
+ if verbose:
94
+ print(f'Getting individual NQS for {team}{BLANK_SPACES}', end='\r' if team != teams[-1] else None)
95
+ rtn = RtnSingleTeamYear(year=year, team_name=team, session=session)
96
+ res = rtn.get_individual_nqs(force_update=force_update if i == 0 else False)
97
+ if verbose and len(res) == 0:
98
+ print(f'\tNo individual NQS found for {team}')
99
+ all_nqs.append(res)
100
+
101
+ return pd.concat(all_nqs)[ROSTER_COLS + EVENTS] # + ['AA']]
102
+
103
+
104
+ def rankings(year, team_vs_ind='team', event='AA', week=None, force_update=False):
105
+ session = get_session()
106
+ rtn = RtnSingleTeamYear(year=year, team_name=None, session=session)
107
+ return rtn.get_overall_rankings(team_vs_ind=team_vs_ind, event=event, week=week, force_update=force_update)
@@ -1,4 +1,6 @@
1
1
  import requests
2
+ from requests.adapters import HTTPAdapter
3
+ from urllib3.util.retry import Retry
2
4
  from functools import lru_cache
3
5
  from datetime import datetime
4
6
 
@@ -19,16 +21,32 @@ def validate_input(teams):
19
21
  return teams
20
22
 
21
23
 
24
+ def get_session():
25
+ session = requests.Session()
26
+ retry = Retry(connect=3, backoff_factor=0.5)
27
+ adapter = HTTPAdapter(max_retries=retry)
28
+ session.mount('http://', adapter)
29
+ session.mount('https://', adapter)
30
+ return session
31
+
32
+
22
33
  @lru_cache(maxsize=1000000)
23
- def get_data_from_api(endpoint, suffix):
34
+ def get_data_from_api(endpoint, suffix, session=None):
35
+ if not session:
36
+ session = get_session()
37
+
24
38
  url = 'https://www.roadtonationals.com/api/women/' + endpoint
25
39
  if suffix:
26
40
  url += '/' + suffix
27
- return requests.get(url)
41
+
42
+ return session.get(url)
28
43
 
29
44
 
30
45
  def fix_opponents(ops):
31
- ops = (ops.replace(', ', '/').replace(' and ', '/').replace(' @ ', '/').replace(' w/ ', '/').replace(' with ', '/'))
46
+ if not isinstance(ops, str):
47
+ return ops
48
+
49
+ ops = (ops.replace(', ', '/').replace(',','/').replace(' and ', '/').replace(' @ ', '/').replace(' w/ ', '/').replace(' with ', '/'))
32
50
 
33
51
  if 'william & mary' in ops.lower():
34
52
  # Todo: title case is going to mess with other opponents here, full mapping might fix that
@@ -1,15 +1,15 @@
1
- Metadata-Version: 2.1
2
- Name: scraping-rtn
3
- Version: 0.0.1
1
+ Metadata-Version: 2.4
2
+ Name: scraping_rtn
3
+ Version: 0.0.8.0
4
4
  Summary: package to scrape gymnastics data from Road To Nationals
5
5
  Author-email: Claire Harmon <ceharmon220@gmail.com>
6
- Project-URL: Homepage, https://github.com/cgn-charmon/scraping_rtn
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ceharmon/scraping_rtn
7
8
  Classifier: Programming Language :: Python :: 3
8
9
  Classifier: License :: OSI Approved :: MIT License
9
10
  Classifier: Operating System :: OS Independent
10
11
  Requires-Python: >=3.9
11
12
  Description-Content-Type: text/markdown
12
- License-File: LICENSE
13
13
  Requires-Dist: pandas>=1.5.3
14
14
  Requires-Dist: numpy>=1.23.5
15
15
  Requires-Dist: requests>=2.28.1
@@ -1,4 +1,3 @@
1
- LICENSE
2
1
  README.md
3
2
  pyproject.toml
4
3
  src/scraping_rtn/RtnSingleTeamYear.py
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2023 Claire Harmon
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
@@ -1,99 +0,0 @@
1
- from .RtnSingleTeamYear import RtnSingleTeamYear
2
- from .src import validate_input, get_extra_cols, SCHEDULE_COLS, RESULTS_COLS, IND_RESULTS_COLS, EVENTS, ROSTER_COLS
3
- import pandas as pd
4
-
5
-
6
- def save(df, filename):
7
- df.to_csv(filename, index=False)
8
-
9
-
10
- def all_teams(year):
11
- rtn = RtnSingleTeamYear(year=year, team_name=None)
12
- return list(rtn.get_team_mapping().keys())
13
-
14
-
15
- def roster(year, teams, include_hometowns=False, include_class=False, include_events=False, verbose=False):
16
- teams = validate_input(teams)
17
-
18
- all_rosters = []
19
- for team in teams:
20
- if verbose:
21
- print(f'Getting roster for {team} ', end='\r')
22
- rtn = RtnSingleTeamYear(year=year, team_name=team)
23
- res = rtn.get_roster(include_hometowns=include_hometowns, include_class=include_class,
24
- include_events=include_events)
25
- if verbose and len(res) == 0:
26
- print(f'\tNo roster found for {team}')
27
- all_rosters.append(res)
28
-
29
- extra_cols = get_extra_cols(include_hometowns=include_hometowns, include_class=include_class, include_events=include_events)
30
- return pd.concat(all_rosters)[ROSTER_COLS + extra_cols]
31
-
32
-
33
- def schedule(year, teams, verbose=False):
34
- teams = validate_input(teams)
35
-
36
- all_schedules = []
37
- for team in teams:
38
- if verbose:
39
- print(f'Getting schedule for {team} ', end='\r')
40
- rtn = RtnSingleTeamYear(year=year, team_name=team)
41
- res = rtn.get_schedule()
42
- if verbose and len(res) == 0:
43
- print(f'\tNo schedule found for {team}')
44
- all_schedules.append(res)
45
-
46
- return pd.concat(all_schedules)[SCHEDULE_COLS]
47
-
48
-
49
- def team_results(year, teams, method='team_consistency', force_update=False, verbose=False):
50
- teams = validate_input(teams)
51
-
52
- all_results = []
53
- for team in teams:
54
- if verbose:
55
- print(f'Getting schedule and results for {team} ', end='\r')
56
- rtn = RtnSingleTeamYear(year=year, team_name=team)
57
- res = rtn.get_team_scores(method=method,force_update=force_update)
58
- if verbose and len(res) == 0:
59
- print(f'\tNo schedule and results found for {team}')
60
- all_results.append(res)
61
-
62
- return pd.concat(all_results)[SCHEDULE_COLS + RESULTS_COLS]
63
-
64
-
65
- def individual_results(year, teams, method='by_meet', force_update=False, verbose=False):
66
- teams = validate_input(teams)
67
-
68
- all_scores = []
69
- for team in teams:
70
- if verbose:
71
- print(f'Getting scores for {team} ', end='\r')
72
- rtn = RtnSingleTeamYear(year=year, team_name=team)
73
- res = rtn.get_individual_scores(method=method, force_update=force_update)
74
- if verbose and len(res) == 0:
75
- print(f'\tNo scores found for {team}')
76
- all_scores.append(res)
77
-
78
- return pd.concat(all_scores)[SCHEDULE_COLS + IND_RESULTS_COLS]
79
-
80
-
81
- def individual_nqs(year, teams, verbose=False):
82
- teams = validate_input(teams)
83
-
84
- all_nqs = []
85
- for team in teams:
86
- if verbose:
87
- print(f'Getting individual NQS for {team} ', end='\r')
88
- rtn = RtnSingleTeamYear(year=year, team_name=team)
89
- res = rtn.get_individual_nqs()
90
- if verbose and len(res) == 0:
91
- print(f'\tNo individual NQS found for {team}')
92
- all_nqs.append(res)
93
-
94
- return pd.concat(all_nqs)[ROSTER_COLS + EVENTS] # + ['AA']]
95
-
96
-
97
- def rankings(year, team_vs_ind='team', event='AA', week=None):
98
- rtn = RtnSingleTeamYear(year=year, team_name=None)
99
- return rtn.get_overall_rankings(team_vs_ind=team_vs_ind, event=event, week=week)
File without changes
File without changes