TopDownHockey-Scraper 5.0.1__py3-none-any.whl → 6.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of TopDownHockey-Scraper might be problematic. Click here for more details.

@@ -19,6 +19,45 @@ import xmltodict
19
19
  from xml.parsers.expat import ExpatError
20
20
  from requests.exceptions import ChunkedEncodingError
21
21
 
22
+ print('Successfully did local install plus update')
23
+
24
+ team_names = ['ANAHEIM DUCKS',
25
+ 'ARIZONA COYOTES',
26
+ 'ATLANTA THRASHERS',
27
+ 'BOSTON BRUINS',
28
+ 'BUFFALO SABRES',
29
+ 'CALGARY FLAMES',
30
+ 'CHICAGO BLACKHAWKS',
31
+ 'COLORADO AVALANCHE',
32
+ 'COLUMBUS BLUE JACKETS',
33
+ 'DALLAS STARS',
34
+ 'DETROIT RED WINGS',
35
+ 'EDMONTON OILERS',
36
+ 'FLORIDA PANTHERS',
37
+ 'LOS ANGELES KINGS',
38
+ 'MINNESOTA WILD',
39
+ 'MONTRÉAL CANADIENS',
40
+ 'MONTREAL CANADIENS',
41
+ 'NASHVILLE PREDATORS',
42
+ 'NEW JERSEY DEVILS',
43
+ 'NEW YORK ISLANDERS',
44
+ 'NEW YORK RANGERS',
45
+ 'OTTAWA SENATORS',
46
+ 'PHILADELPHIA FLYERS',
47
+ 'PITTSBURGH PENGUINS',
48
+ 'PHOENIX COYOTES',
49
+ 'CAROLINA HURRICANES',
50
+ 'SAN JOSE SHARKS',
51
+ 'ST. LOUIS BLUES',
52
+ 'TAMPA BAY LIGHTNING',
53
+ 'TORONTO MAPLE LEAFS',
54
+ 'UTAH MAMMOTH',
55
+ 'VANCOUVER CANUCKS',
56
+ 'VEGAS GOLDEN KNIGHTS',
57
+ 'WASHINGTON CAPITALS',
58
+ 'WINNIPEG JETS',
59
+ 'SEATTLE KRAKEN']
60
+
22
61
  # ewc stands for "Events we care about."
23
62
 
24
63
  ewc = ['SHOT', 'HIT', 'BLOCK', 'MISS', 'GIVE', 'TAKE', 'GOAL']
@@ -231,16 +270,17 @@ def group_if_not_none(result):
231
270
  return(result)
232
271
 
233
272
  def scrape_html_roster(season, game_id):
273
+
234
274
  url = 'http://www.nhl.com/scores/htmlreports/' + season + '/RO0' + game_id + '.HTM'
235
275
  page = requests.get(url)
236
276
  soup = BeautifulSoup(page.content.decode('ISO-8859-1'), 'lxml', multi_valued_attributes = None)
237
-
277
+
238
278
  teamsoup = soup.find_all('td', {'align':'center', 'class':['teamHeading + border', 'teamHeading + border '], 'width':'50%'})
239
279
  away_team = teamsoup[0].get_text()
240
280
  home_team = teamsoup[1].get_text()
241
-
281
+
242
282
  home_player_soup = (soup.find_all('table', {'align':'center', 'border':'0', 'cellpadding':'0',
243
- 'cellspacing':'0', 'width':'100%', 'xmlns:ext':''}))[1].find_all('td')
283
+ 'cellspacing':'0', 'width':'100%'}))[2].find_all('td')
244
284
 
245
285
  length = int(len(home_player_soup)/3)
246
286
 
@@ -251,7 +291,7 @@ def scrape_html_roster(season, game_id):
251
291
  home_player_df = home_player_df.drop(0).assign(team = 'home', team_name = home_team)
252
292
 
253
293
  away_player_soup = (soup.find_all('table', {'align':'center', 'border':'0', 'cellpadding':'0',
254
- 'cellspacing':'0', 'width':'100%', 'xmlns:ext':''}))[0].find_all('td')
294
+ 'cellspacing':'0', 'width':'100%', }))[1].find_all('td')
255
295
 
256
296
  length = int(len(away_player_soup)/3)
257
297
 
@@ -260,15 +300,15 @@ def scrape_html_roster(season, game_id):
260
300
  away_player_df.columns = away_player_df.iloc[0]
261
301
 
262
302
  away_player_df = away_player_df.drop(0).assign(team = 'away', team_name = away_team)
263
-
303
+
264
304
  #global home_scratch_soup
265
-
305
+
266
306
  if len(soup.find_all('table', {'align':'center', 'border':'0', 'cellpadding':'0',
267
- 'cellspacing':'0', 'width':'100%', 'xmlns:ext':''}))>3:
307
+ 'cellspacing':'0', 'width':'100%', }))>3:
268
308
 
269
309
  home_scratch_soup = (soup.find_all('table', {'align':'center', 'border':'0', 'cellpadding':'0',
270
- 'cellspacing':'0', 'width':'100%', 'xmlns:ext':''}))[3].find_all('td')
271
-
310
+ 'cellspacing':'0', 'width':'100%', }))[4].find_all('td')
311
+
272
312
  if len(home_scratch_soup)>1:
273
313
 
274
314
  length = int(len(home_scratch_soup)/3)
@@ -284,10 +324,10 @@ def scrape_html_roster(season, game_id):
284
324
  home_scratch_df = pd.DataFrame()
285
325
 
286
326
  if len(soup.find_all('table', {'align':'center', 'border':'0', 'cellpadding':'0',
287
- 'cellspacing':'0', 'width':'100%', 'xmlns:ext':''}))>2:
288
-
327
+ 'cellspacing':'0', 'width':'100%', }))>2:
328
+
289
329
  away_scratch_soup = (soup.find_all('table', {'align':'center', 'border':'0', 'cellpadding':'0',
290
- 'cellspacing':'0', 'width':'100%', 'xmlns:ext':''}))[2].find_all('td')
330
+ 'cellspacing':'0', 'width':'100%', }))[3].find_all('td')
291
331
 
292
332
  if len(away_scratch_soup)>1:
293
333
 
@@ -306,31 +346,34 @@ def scrape_html_roster(season, game_id):
306
346
  player_df = pd.concat([home_player_df, away_player_df]).assign(status = 'player')
307
347
  scratch_df = pd.concat([home_scratch_df, away_scratch_df]).assign(status = 'scratch')
308
348
  roster_df = pd.concat([player_df, scratch_df])
309
-
349
+
310
350
  roster_df = roster_df.assign(team = np.where(roster_df.team=='CANADIENS MONTREAL', 'MONTREAL CANADIENS', roster_df.team))
311
-
351
+
352
+ roster_df = roster_df.assign(team = np.where(roster_df.team=='MONTRÉAL CANADIENS', 'MONTREAL CANADIENS', roster_df.team))
353
+
312
354
  # FIX NAMES
313
355
 
314
356
  roster_df = roster_df.rename(columns = {'Nom/Name':'Name'})
315
-
316
- roster_df.Name = roster_df.Name.str.split('(').str[0].str.strip()
317
-
357
+
358
+ roster_df.Name = roster_df.Name.apply(lambda x: re.sub(r' \(A\)$', '', x).strip())
359
+ roster_df.Name = roster_df.Name.apply(lambda x: re.sub(r' \(C\)$', '', x).strip())
360
+
318
361
  # Max Pacioretty doesn't exist in ESPN in 2009-2010, sadly.
319
-
362
+
320
363
  roster_df['Name'] = np.where(roster_df['Name'].str.contains('ALEXANDRE '),
321
364
  roster_df.Name.str.replace('ALEXANDRE ', 'ALEX '),
322
365
  roster_df['Name'])
323
-
366
+
324
367
  roster_df['Name'] = np.where(roster_df['Name'].str.contains('ALEXANDER '),
325
368
  roster_df.Name.str.replace('ALEXANDER ', 'ALEX '),
326
369
  roster_df['Name'])
327
-
370
+
328
371
  roster_df['Name'] = np.where(roster_df['Name'].str.contains('CHRISTOPHER '),
329
372
  roster_df.Name.str.replace('CHRISTOPHER ', 'CHRIS '),
330
373
  roster_df['Name'])
331
-
374
+
332
375
  # List of names and fixed from Evolving Hockey Scraper.
333
-
376
+
334
377
  roster_df = roster_df.assign(Name =
335
378
  (np.where(roster_df['Name']== "ANDREI KASTSITSYN" , "ANDREI KOSTITSYN",
336
379
  (np.where(roster_df['Name']== "AJ GREER" , "A.J. GREER",
@@ -434,7 +477,7 @@ def scrape_html_roster(season, game_id):
434
477
  roster_df['Name']))))))))))))))))))))))))))))))))))))))))))))))))))))))
435
478
  )))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))
436
479
  ))))))))))
437
-
480
+
438
481
  roster_df['Name'] = (np.where(roster_df['Name']== "RODNEY PELLEY" , "ROD PELLEY",
439
482
  (np.where(roster_df['Name']== "SIARHEI KASTSITSYN" , "SERGEI KOSTITSYN",
440
483
  (np.where(roster_df['Name']== "SIMEON VARLAMOV" , "SEMYON VARLAMOV",
@@ -475,7 +518,7 @@ def scrape_html_roster(season, game_id):
475
518
  (np.where(roster_df['Name']== "MATTIAS JANMARK-NYLEN" , "MATTIAS JANMARK",
476
519
  (np.where(roster_df['Name']== "JOSH DUNNE" , "JOSHUA DUNNE",roster_df['Name'])))))))))))))))))))))))))))))))))))))))))))
477
520
  )))))))))))))))))))))))))))))))))))
478
-
521
+
479
522
  roster_df['Name'] = np.where((roster_df['Name']=="SEBASTIAN AHO") & (roster_df['Pos']=='D'), 'SEBASTIAN AHO SWE', roster_df['Name'])
480
523
  roster_df['Name'] = np.where((roster_df['Name']=="ELIAS PETTERSSON") & (roster_df['Pos']=='D'), 'ELIAS PETTERSSON(D)', roster_df['Name'])
481
524
  roster_df['Name'] = np.where((roster_df['Name']=="COLIN WHITE") & (roster_df['Pos']=='D'), 'COLIN WHITE CAN', roster_df['Name'])
@@ -504,454 +547,374 @@ def scrape_html_roster(season, game_id):
504
547
  (np.where(roster_df['Name']== "EMIL LILLEBERG" , "EMIL MARTINSEN LILLEBERG",
505
548
  (np.where(roster_df['Name']== "CAMERON ATKINSON" , "CAM ATKINSON",
506
549
  (np.where(roster_df['Name']== "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY",
507
- roster_df['Name']))))))))))))))))))))))))))))))))))
550
+ (np.where(roster_df['Name']== "MARTIN FEHARVARY" , "MARTIN FEHERVARY",
551
+ roster_df['Name']))))))))))))))))))))))))))))))))))))
508
552
 
509
553
  roster_df['Name'] = roster_df['Name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
510
554
 
511
555
  roster_df['Name'] = np.where(roster_df['Name']== "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY", roster_df['Name']) # Need to do this after normalization, only then he becomes Slafkovska?
556
+ roster_df['Name'] = np.where(roster_df['Name']== "JOHN (JACK) ROSLOVIC" , "JACK ROSLOVIC", roster_df['Name'])
557
+ roster_df['Name'] = np.where(roster_df['Name']== "ANTHONY-JOHN (AJ) GREER" , "A.J. GREER", roster_df['Name'])
558
+
559
+ roster_df['Name'] = np.where(roster_df['Name']== "MARTIN FEHARVARY" , "MARTIN FEHERVARY", roster_df['Name'])
560
+
561
+ roster_df['Name'] = np.where(roster_df['Name']== "MATAJ BLAMEL" , "MATAJ BLAMEL", roster_df['Name'])
562
+
563
+ roster_df['Name'] = roster_df['Name'].str.replace(' ', ' ')
512
564
 
513
565
  return roster_df
514
566
 
515
- def scrape_html_shifts(season, game_id):
516
-
567
+ def scrape_html_shifts(season, game_id, live = True):
568
+
569
+ goalie_names = ['AARON DELL',
570
+ 'ADAM HUSKA',
571
+ 'ADAM WERNER',
572
+ 'ADAM WILCOX',
573
+ 'ADIN HILL',
574
+ 'AKIRA SCHMID',
575
+ 'AL MONTOYA',
576
+ 'ALEKSEI KOLOSOV',
577
+ 'ALES STEZKA',
578
+ 'ALEX AULD',
579
+ 'ALEX LYON',
580
+ 'ALEX NEDELJKOVIC',
581
+ 'ALEX PECHURSKI',
582
+ 'ALEX SALAK',
583
+ 'ALEX STALOCK',
584
+ 'ALEXANDAR GEORGIEV',
585
+ 'ALEXEI MELNICHUK',
586
+ 'ALLEN YORK',
587
+ 'ANDERS LINDBACK',
588
+ 'ANDERS NILSSON',
589
+ 'ANDREI VASILEVSKIY',
590
+ 'ANDREW HAMMOND',
591
+ 'ANDREW RAYCROFT',
592
+ 'ANDREY MAKAROV',
593
+ 'ANTERO NIITTYMAKI',
594
+ 'ANTHONY STOLARZ',
595
+ 'ANTOINE BIBEAU',
596
+ 'ANTON FORSBERG',
597
+ 'ANTON KHUDOBIN',
598
+ 'ANTTI NIEMI',
599
+ 'ANTTI RAANTA',
600
+ 'ARTURS SILOVS',
601
+ 'ARTYOM ZAGIDULIN',
602
+ 'ARVID SODERBLOM',
603
+ 'BEN BISHOP',
604
+ 'BEN SCRIVENS',
605
+ 'BRAD THIESSEN',
606
+ 'BRADEN HOLTBY',
607
+ 'BRANDON HALVERSON',
608
+ 'BRENT JOHNSON',
609
+ 'BRENT KRAHN',
610
+ 'BRIAN BOUCHER',
611
+ 'BRIAN ELLIOTT',
612
+ 'BRIAN FOSTER',
613
+ 'CAL PETERSEN',
614
+ 'CALVIN HEETER',
615
+ 'CALVIN PETERSEN',
616
+ 'CALVIN PICKARD',
617
+ 'CAM TALBOT',
618
+ 'CAM WARD',
619
+ 'CAREY PRICE',
620
+ 'CARTER HART',
621
+ 'CARTER HUTTON',
622
+ 'CASEY DESMITH',
623
+ 'CAYDEN PRIMEAU',
624
+ 'CEDRICK DESJARDINS',
625
+ 'CHAD JOHNSON',
626
+ 'CHARLIE LINDGREN',
627
+ 'CHRIS BECKFORD-TSEU',
628
+ 'CHRIS DRIEDGER',
629
+ 'CHRIS GIBSON',
630
+ 'CHRIS HOLT',
631
+ 'CHRIS MASON',
632
+ 'CHRIS OSGOOD',
633
+ 'COLLIN DELIA',
634
+ 'CONNOR HELLEBUYCK',
635
+ 'CONNOR INGRAM',
636
+ 'CONNOR KNAPP',
637
+ 'COREY CRAWFORD',
638
+ 'CORY SCHNEIDER',
639
+ 'CRAIG ANDERSON',
640
+ 'CRISTOBAL HUET',
641
+ 'CRISTOPHER NILSTORP',
642
+ 'CURTIS JOSEPH',
643
+ 'CURTIS MCELHINNEY',
644
+ 'CURTIS SANFORD',
645
+ 'DAN CLOUTIER',
646
+ 'DAN ELLIS',
647
+ 'DAN VLADAR',
648
+ 'DANIEL LACOSTA',
649
+ 'DANIEL TAYLOR',
650
+ 'DANIIL TARASOV',
651
+ 'DANY SABOURIN',
652
+ 'DARCY KUEMPER',
653
+ 'DAVID AEBISCHER',
654
+ 'DAVID AYRES',
655
+ 'DAVID LENEVEU',
656
+ 'DAVID RITTICH',
657
+ 'DENNIS HILDEBY',
658
+ 'DEVAN DUBNYK',
659
+ 'DEVIN COOLEY',
660
+ 'DEVON LEVI',
661
+ 'DIMITRI PATZOLD',
662
+ 'DOMINIK HASEK',
663
+ 'DREW COMMESSO',
664
+ 'DREW MACINTYRE',
665
+ 'DUSTIN TOKARSKI',
666
+ 'DUSTIN WOLF',
667
+ 'DWAYNE ROLOSON',
668
+ 'DYLAN FERGUSON',
669
+ 'DYLAN WELLS',
670
+ 'EDDIE LACK',
671
+ 'EDWARD PASQUALE',
672
+ 'EETU MAKINIEMI',
673
+ 'ELVIS MERZLIKINS',
674
+ 'ERIC COMRIE',
675
+ 'ERIK ERSBERG',
676
+ 'ERIK KALLGREN',
677
+ 'ERIK PORTILLO',
678
+ 'EVGENI NABOKOV',
679
+ 'FELIX SANDSTROM',
680
+ 'FILIP GUSTAVSSON',
681
+ 'FREDERIK ANDERSEN',
682
+ 'FREDRIK NORRENA',
683
+ 'GARRET SPARKS',
684
+ 'GEORGI ROMANOV',
685
+ 'GILLES SENN',
686
+ 'HANNU TOIVONEN',
687
+ 'HARRI SATERI',
688
+ 'HENRIK KARLSSON',
689
+ 'HENRIK LUNDQVIST',
690
+ 'HUGO ALNEFELT',
691
+ 'HUNTER MISKA',
692
+ 'HUNTER SHEPARD',
693
+ 'IGOR SHESTERKIN',
694
+ 'IIRO TARKKI',
695
+ 'ILYA BRYZGALOV',
696
+ 'ILYA SAMSONOV',
697
+ 'ILYA SOROKIN',
698
+ 'IVAN FEDOTOV',
699
+ 'IVAN PROSVETOV',
700
+ 'J-F BERUBE',
701
+ 'JACK CAMPBELL',
702
+ 'JACK LAFONTAINE',
703
+ 'JACOB MARKSTROM',
704
+ 'JAKE ALLEN',
705
+ 'JAKE OETTINGER',
706
+ 'JAKUB DOBES',
707
+ 'JAKUB SKAREK',
708
+ 'JAMES REIMER',
709
+ 'JARED COREAU',
710
+ 'JAROSLAV HALAK',
711
+ 'JASON KASDORF',
712
+ 'JASON LABARBERA',
713
+ 'JAXSON STAUBER',
714
+ 'JEAN-SEBASTIEN AUBIN',
715
+ 'JEAN-SEBASTIEN GIGUERE',
716
+ 'JEFF DESLAURIERS',
717
+ 'JEFF FRAZEE',
718
+ 'JEFF GLASS',
719
+ 'JEFF ZATKOFF',
720
+ 'JEREMY DUCHESNE',
721
+ 'JEREMY SMITH',
722
+ 'JEREMY SWAYMAN',
723
+ 'JESPER WALLSTEDT',
724
+ 'JET GREAVES',
725
+ 'JHONAS ENROTH',
726
+ 'JIMMY HOWARD',
727
+ 'JIRI PATERA',
728
+ 'JOACIM ERIKSSON',
729
+ 'JOCELYN THIBAULT',
730
+ 'JOEL BLOMQVIST',
731
+ 'JOEL HOFER',
732
+ 'JOEY DACCORD',
733
+ 'JOEY MACDONALD',
734
+ 'JOHAN BACKLUND',
735
+ 'JOHAN HEDBERG',
736
+ 'JOHAN HOLMQVIST',
737
+ 'JOHN CURRY',
738
+ 'JOHN GIBSON',
739
+ 'JOHN GRAHAME',
740
+ 'JON GILLIES',
741
+ 'JONAS GUSTAVSSON',
742
+ 'JONAS HILLER',
743
+ 'JONAS JOHANSSON',
744
+ 'JONATHAN BERNIER',
745
+ 'JONATHAN QUICK',
746
+ 'JONI ORTIO',
747
+ 'JOONAS KORPISALO',
748
+ 'JORDAN BINNINGTON',
749
+ 'JOSE THEODORE',
750
+ 'JOSEF KORENAR',
751
+ 'JOSEPH WOLL',
752
+ 'JOSH HARDING',
753
+ 'JOSH TORDJMAN',
754
+ 'JUSSI RYNNAS',
755
+ 'JUSTIN PETERS',
756
+ 'JUSTIN POGGE',
757
+ 'JUSTUS ANNUNEN',
758
+ 'JUUSE SAROS',
759
+ 'KAAPO KAHKONEN',
760
+ 'KADEN FULCHER',
761
+ 'KAREL VEJMELKA',
762
+ 'KARI LEHTONEN',
763
+ 'KARRI RAMO',
764
+ 'KASIMIR KASKISUO',
765
+ 'KEITH KINKAID',
766
+ 'KEN APPLEBY',
767
+ 'KENNETH APPLEBY',
768
+ 'KENT SIMPSON',
769
+ 'KEVIN BOYLE',
770
+ 'KEVIN LANKINEN',
771
+ 'KEVIN MANDOLESE',
772
+ 'KEVIN POULIN',
773
+ 'KEVIN WEEKES',
774
+ 'KRISTERS GUDLEVSKIS',
775
+ 'LANDON BOW',
776
+ 'LAURENT BROSSOIT',
777
+ 'LEEVI MERILAINEN',
778
+ 'LELAND IRVING',
779
+ 'LINUS ULLMARK',
780
+ 'LOGAN THOMPSON',
781
+ 'LOUIS DOMINGUE',
782
+ 'LUKAS DOSTAL',
783
+ 'MACKENZIE BLACKWOOD',
784
+ 'MACKENZIE SKAPSKI',
785
+ 'MADS SOGAARD',
786
+ 'MAGNUS CHRONA',
787
+ 'MAGNUS HELLBERG',
788
+ 'MALCOLM SUBBAN',
789
+ 'MANNY FERNANDEZ',
790
+ 'MANNY LEGACE',
791
+ 'MARC DENIS',
792
+ 'MARC-ANDRE FLEURY',
793
+ 'MARCUS HOGBERG',
794
+ 'MAREK LANGHAMER',
795
+ 'MAREK MAZANEC',
796
+ 'MAREK SCHWARZ',
797
+ 'MARK DEKANICH',
798
+ 'MARK VISENTIN',
799
+ 'MARTIN BIRON',
800
+ 'MARTIN BRODEUR',
801
+ 'MARTIN GERBER',
802
+ 'MARTIN JONES',
803
+ 'MARTY TURCO',
804
+ 'MATHIEU GARON',
805
+ 'MATISS KIVLENIEKS',
806
+ 'MATT CLIMIE',
807
+ 'MATT HACKETT',
808
+ 'MATT KEETLEY',
809
+ 'MATT MURRAY',
810
+ 'MATT TOMKINS',
811
+ 'MATT VILLALTA',
812
+ 'MATT ZABA',
813
+ "MATTHEW O'CONNOR",
814
+ 'MAXIME LAGACE',
815
+ 'MICHAEL DIPIETRO',
816
+ 'MICHAEL HOUSER',
817
+ 'MICHAEL HUTCHINSON',
818
+ 'MICHAEL LEIGHTON',
819
+ 'MICHAEL MCNIVEN',
820
+ 'MICHAL NEUVIRTH',
821
+ 'MIIKKA KIPRUSOFF',
822
+ 'MIKAEL TELLQVIST',
823
+ 'MIKE BRODEUR',
824
+ 'MIKE CONDON',
825
+ 'MIKE MCKENNA',
826
+ 'MIKE MURPHY',
827
+ 'MIKE SMITH',
828
+ 'MIKKO KOSKINEN',
829
+ 'NATHAN LAWSON',
830
+ 'NATHAN LIEUWEN',
831
+ 'NICO DAWS',
832
+ 'NIKITA TOLOPILO',
833
+ 'NIKKE KOKKO',
834
+ 'NIKLAS BACKSTROM',
835
+ 'NIKLAS SVEDBERG',
836
+ 'NIKLAS TREUTLE',
837
+ 'NIKOLAI KHABIBULIN',
838
+ 'OLIE KOLZIG',
839
+ 'OLIVIER RODRIGUE',
840
+ 'OLLE ERIKSSON EK',
841
+ 'ONDREJ PAVELEC',
842
+ 'OSCAR DANSK',
843
+ 'PASCAL LECLAIRE',
844
+ 'PATRICK LALIME',
845
+ 'PAVEL FRANCOUZ',
846
+ 'PEKKA RINNE',
847
+ 'PETER BUDAJ',
848
+ 'PETER MANNINO',
849
+ 'PETR MRAZEK',
850
+ 'PHEONIX COPLEY',
851
+ 'PHILIPP GRUBAUER',
852
+ 'PYOTR KOCHETKOV',
853
+ 'RAY EMERY',
854
+ 'RETO BERRA',
855
+ 'RICHARD BACHMAN',
856
+ 'RICK DIPIETRO',
857
+ 'RIKU HELENIUS',
858
+ 'ROB ZEPP',
859
+ 'ROBERTO LUONGO',
860
+ 'ROBIN LEHNER',
861
+ 'ROMAN WILL',
862
+ 'RYAN MILLER',
863
+ 'SAM MONTEMBEAULT',
864
+ 'SAMUEL MONTEMBEAULT',
865
+ 'SAMI AITTOKALLIO',
866
+ 'SAMUEL ERSSON',
867
+ 'SCOTT CLEMMENSEN',
868
+ 'SCOTT DARLING',
869
+ 'SCOTT FOSTER',
870
+ 'SCOTT WEDGEWOOD',
871
+ 'SEBASTIAN COSSA',
872
+ 'SEBASTIEN CARON',
873
+ 'SEMYON VARLAMOV',
874
+ 'SERGEI BOBROVSKY',
875
+ 'SPENCER KNIGHT',
876
+ 'SPENCER MARTIN',
877
+ 'STEVE MASON',
878
+ 'STEVE VALIQUETTE',
879
+ 'STUART SKINNER',
880
+ 'THATCHER DEMKO',
881
+ 'THOMAS GREISS',
882
+ 'THOMAS HODGES',
883
+ 'TIM THOMAS',
884
+ 'TIMO PIELMEIER',
885
+ 'TOBIAS STEPHAN',
886
+ 'TOM MCCOLLUM',
887
+ 'TOMAS VOKOUN',
888
+ 'TRENT MINER',
889
+ 'TRISTAN JARRY',
890
+ 'TRISTAN LENNOX',
891
+ 'TROY GROSENICK',
892
+ 'TUUKKA RASK',
893
+ 'TY CONKLIN',
894
+ 'TYLER BUNZ',
895
+ 'TYLER WEIMAN',
896
+ 'UKKO-PEKKA LUUKKONEN',
897
+ 'VEINI VEHVILAINEN',
898
+ 'VESA TOSKALA',
899
+ 'VICTOR OSTMAN',
900
+ 'VIKTOR FASTH',
901
+ 'VILLE HUSSO',
902
+ 'VITEK VANECEK',
903
+ 'WADE DUBIELEWICZ',
904
+ 'YANIV PERETS',
905
+ 'YANN DANIS',
906
+ 'YAROSLAV ASKAROV',
907
+ 'ZACH FUCALE',
908
+ 'ZACH SAWCHENKO',
909
+ 'ZANE MCINTYRE']
910
+
517
911
  url = 'http://www.nhl.com/scores/htmlreports/' + season + '/TH0' + game_id + '.HTM'
518
912
  page = (requests.get(url))
519
- soup = BeautifulSoup(page.content.decode('ISO-8859-1'), 'lxml', multi_valued_attributes = None)
520
- found = soup.find_all('td', {'class':['playerHeading + border', 'lborder + bborder']})
913
+ home_soup = BeautifulSoup(page.content)
914
+ found = home_soup.find_all('td', {'class':['playerHeading + border', 'lborder + bborder']})
521
915
  if len(found)==0:
522
916
  raise IndexError('This game has no shift data.')
523
- thisteam = soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()
524
-
525
- goalie_names = ['AARON DELL',
526
- 'AARON SOROCHAN',
527
- 'ADAM HUSKA',
528
- 'ADAM WERNER',
529
- 'ADAM WILCOX',
530
- 'ADIN HILL',
531
- 'AKIRA SCHMID',
532
- 'AL MONTOYA',
533
- 'ALEKSEI KOLOSOV',
534
- 'ALEX AULD',
535
- "ALEX D'ORIO",
536
- 'ALEX LYON',
537
- 'ALEX NEDELJKOVIC',
538
- 'ALEX PECHURSKI',
539
- 'ALEX SALAK',
540
- 'ALEX STALOCK',
541
- 'ALEXANDAR GEORGIEV',
542
- 'ALEXEI MELNICHUK',
543
- 'ALLEN YORK',
544
- 'ANDERS LINDBACK',
545
- 'ANDERS NILSSON',
546
- 'ANDREI VASILEVSKIY',
547
- 'ANDREW HAMMOND',
548
- 'ANDREW RAYCROFT',
549
- 'ANDREY MAKAROV',
550
- 'ANGUS REDMOND',
551
- 'ANTERO NIITTYMAKI',
552
- 'ANTHONY STOLARZ',
553
- 'ANTOINE BIBEAU',
554
- 'ANTON FORSBERG',
555
- 'ANTON KHUDOBIN',
556
- 'ANTTI NIEMI',
557
- 'ANTTI RAANTA',
558
- 'ARTURS SILOVS',
559
- 'ARTYOM ZAGIDULIN',
560
- 'ARVID SODERBLOM',
561
- 'BEN BISHOP',
562
- 'BEN SCRIVENS',
563
- 'BEN WEXLER',
564
- 'BRAD THIESSEN',
565
- 'BRADEN HOLTBY',
566
- 'BRANDON HALVERSON',
567
- 'BRENT JOHNSON',
568
- 'BRENT KRAHN',
569
- 'BRETT LEONHARDT',
570
- 'BRIAN BOUCHER',
571
- 'BRIAN ELLIOTT',
572
- 'BRIAN FOSTER',
573
- 'BRYAN PITTON',
574
- 'CAL PETERSEN',
575
- 'CALVIN HEETER',
576
- 'CALVIN PETERSEN',
577
- 'CALVIN PICKARD',
578
- 'CAM TALBOT',
579
- 'CAM WARD',
580
- 'CAMERON JOHNSON',
581
- 'CAREY PRICE',
582
- 'CARTER HART',
583
- 'CARTER HUTTON',
584
- 'CASEY DESMITH',
585
- 'CAYDEN PRIMEAU',
586
- 'CEDRICK DESJARDINS',
587
- 'CHAD JOHNSON',
588
- 'CHARLIE LINDGREN',
589
- 'CHET PICKARD',
590
- 'CHRIS BECKFORD-TSEU',
591
- 'CHRIS DRIEDGER',
592
- 'CHRIS GIBSON',
593
- 'CHRIS HOLT',
594
- 'CHRIS MASON',
595
- 'CHRIS OSGOOD',
596
- 'COLE KEHLER',
597
- 'COLLIN DELIA',
598
- 'CONNOR HELLEBUYCK',
599
- 'CONNOR INGRAM',
600
- 'CONNOR KNAPP',
601
- 'COREY CRAWFORD',
602
- 'CORY SCHNEIDER',
603
- 'CRAIG ANDERSON',
604
- 'CRISTOBAL HUET',
605
- 'CRISTOPHER NILSTORP',
606
- 'CURTIS JOSEPH',
607
- 'CURTIS MCELHINNEY',
608
- 'CURTIS SANFORD',
609
- 'DAN CLOUTIER',
610
- 'DAN ELLIS',
611
- 'DAN TURPLE',
612
- 'DAN VLADAR',
613
- 'DANIEL ALTSHULLER',
614
- 'DANIEL LACOSTA',
615
- 'DANIEL LARSSON',
616
- 'DANIEL MANZATO',
617
- 'DANIEL TAYLOR',
618
- 'DANIIL TARASOV',
619
- 'DANY SABOURIN',
620
- 'DARCY KUEMPER',
621
- 'DAREN MACHESNEY',
622
- 'DAVID AEBISCHER',
623
- 'DAVID AYRES',
624
- 'DAVID LENEVEU',
625
- 'DAVID RITTICH',
626
- 'DAVID SHANTZ',
627
- 'DENNIS ENDRAS',
628
- 'DENNIS HILDEBY',
629
- 'DERECK BARIBEAU',
630
- 'DEVAN DUBNYK',
631
- 'DEVIN COOLEY',
632
- 'DEVON LEVI',
633
- 'DIMITRI PATZOLD',
634
- 'DOMINIK HASEK',
635
- 'DREW COMMESSO',
636
- 'DREW MACINTYRE',
637
- 'DUSTIN BUTLER',
638
- 'DUSTIN TOKARSKI',
639
- 'DUSTIN WOLF',
640
- 'DUSTYN ZENNER',
641
- 'DWAYNE ROLOSON',
642
- 'DYLAN FERGUSON',
643
- 'DYLAN WELLS',
644
- 'EAMON MCADAM',
645
- 'EDDIE LACK',
646
- 'EDWARD PASQUALE',
647
- 'EETU MAKINIEMI',
648
- 'ELVIS MERZLIKINS',
649
- 'EMIL LARMI',
650
- 'ERIC COMRIE',
651
- 'ERIC HARTZELL',
652
- 'ERIC SEMBORSKI',
653
- 'ERIK ERSBERG',
654
- 'ERIK KALLGREN',
655
- 'ERIK PORTILLO',
656
- 'EVAN CORMIER',
657
- 'EVAN FITZPATRICK',
658
- 'EVGENI NABOKOV',
659
- 'FELIX SANDSTROM',
660
- 'FILIP GUSTAVSSON',
661
- 'FRED BRATHWAITE',
662
- 'FREDERIC CASSIVI',
663
- 'FREDERIK ANDERSEN',
664
- 'FREDRIK NORRENA',
665
- 'GARRET SPARKS',
666
- 'GAVIN MCHALE',
667
- 'GEORGI ROMANOV',
668
- 'GERALD COLEMAN',
669
- 'GILLES SENN',
670
- 'HANNU TOIVONEN',
671
- 'HARRI SATERI',
672
- 'HENRIK KARLSSON',
673
- 'HENRIK LUNDQVIST',
674
- 'HUGO ALNEFELT',
675
- 'HUNTER MISKA',
676
- 'HUNTER SHEPARD',
677
- 'IGOR BOBKOV',
678
- 'IGOR SHESTERKIN',
679
- 'IIRO TARKKI',
680
- 'ILYA BRYZGALOV',
681
- 'ILYA SAMSONOV',
682
- 'ILYA SOROKIN',
683
- 'IVAN FEDOTOV',
684
- 'IVAN PROSVETOV',
685
- 'J-F BERUBE',
686
- 'J.F. BERUBE',
687
- 'JACK CAMPBELL',
688
- 'JACK LAFONTAINE',
689
- 'JACOB MARKSTROM',
690
- 'JAKE ALLEN',
691
- 'JAKE OETTINGER',
692
- 'JAMES REIMER',
693
- 'JARED COREAU',
694
- 'JAROSLAV HALAK',
695
- 'JASON BACASHIHUA',
696
- 'JASON KASDORF',
697
- 'JASON LABARBERA',
698
- 'JASON MISSIAEN',
699
- 'JAXSON STAUBER',
700
- 'JEAN-PHILIPPE LEVASSEUR',
701
- 'JEAN-SEBASTIEN AUBIN',
702
- 'JEAN-SEBASTIEN GIGUERE',
703
- 'JEFF DESLAURIERS',
704
- 'JEFF FRAZEE',
705
- 'JEFF GLASS',
706
- 'JEFF TYNI',
707
- 'JEFF ZATKOFF',
708
- 'JEREMY DUCHESNE',
709
- 'JEREMY SMITH',
710
- 'JEREMY SWAYMAN',
711
- 'JESPER WALLSTEDT',
712
- 'JET GREAVES',
713
- 'JHONAS ENROTH',
714
- 'JIMMY HOWARD',
715
- 'JIRI PATERA',
716
- 'JOACIM ERIKSSON',
717
- 'JOCELYN THIBAULT',
718
- 'JOE CANNATA',
719
- 'JOE FALLON',
720
- 'JOEL BLOMQVIST',
721
- 'JOEL HOFER',
722
- 'JOEL MARTIN',
723
- 'JOEY DACCORD',
724
- 'JOEY MACDONALD',
725
- 'JOHAN BACKLUND',
726
- 'JOHAN GUSTAFSSON',
727
- 'JOHAN HEDBERG',
728
- 'JOHAN HOLMQVIST',
729
- 'JOHN CURRY',
730
- 'JOHN GIBSON',
731
- 'JOHN GRAHAME',
732
- 'JOHN MUSE',
733
- 'JON GILLIES',
734
- 'JON-PAUL ANDERSON',
735
- 'JONAS GUSTAVSSON',
736
- 'JONAS HILLER',
737
- 'JONAS JOHANSSON',
738
- 'JONATHAN BERNIER',
739
- 'JONATHAN BOUTIN',
740
- 'JONATHAN QUICK',
741
- 'JONI ORTIO',
742
- 'JOONAS KORPISALO',
743
- 'JORDAN BINNINGTON',
744
- 'JORDAN PEARCE',
745
- 'JORDAN SIGALET',
746
- 'JORDAN WHITE',
747
- 'JORGE ALVES',
748
- 'JOSE THEODORE',
749
- 'JOSEF KORENAR',
750
- 'JOSEPH WOLL',
751
- 'JOSH HARDING',
752
- 'JOSH TORDJMAN',
753
- 'JUSSI RYNNAS',
754
- 'JUSTIN KOWALKOSKI',
755
- 'JUSTIN PETERS',
756
- 'JUSTIN POGGE',
757
- 'JUSTUS ANNUNEN',
758
- 'JUUSE SAROS',
759
- 'JUUSO RIKSMAN',
760
- 'KAAPO KAHKONEN',
761
- 'KADEN FULCHER',
762
- 'KAREL VEJMELKA',
763
- 'KARI LEHTONEN',
764
- 'KARRI RAMO',
765
- 'KASIMIR KASKISUO',
766
- 'KEITH KINKAID',
767
- 'KEN APPLEBY',
768
- 'KENNETH APPLEBY',
769
- 'KENT SIMPSON',
770
- 'KEVIN BOYLE',
771
- 'KEVIN LANKINEN',
772
- 'KEVIN MANDOLESE',
773
- 'KEVIN NASTIUK',
774
- 'KEVIN POULIN',
775
- 'KEVIN WEEKES',
776
- 'KRISTERS GUDLEVSKIS',
777
- 'KURTIS MUCHA',
778
- 'LANDON BOW',
779
- 'LARS JOHANSSON',
780
- 'LAURENT BROSSOIT',
781
- 'LEEVI MERILAINEN',
782
- 'LELAND IRVING',
783
- 'LINUS ULLMARK',
784
- 'LOGAN THOMPSON',
785
- 'LOUIS DOMINGUE',
786
- 'LUKAS DOSTAL',
787
- 'MACKENZIE BLACKWOOD',
788
- 'MACKENZIE SKAPSKI',
789
- 'MADS SOGAARD',
790
- 'MAGNUS CHRONA',
791
- 'MAGNUS HELLBERG',
792
- 'MALCOLM SUBBAN',
793
- 'MANNY FERNANDEZ',
794
- 'MANNY LEGACE',
795
- 'MARC CHEVERIE',
796
- 'MARC DENIS',
797
- 'MARC-ANDRE FLEURY',
798
- 'MARCUS HOGBERG',
799
- 'MAREK LANGHAMER',
800
- 'MAREK MAZANEC',
801
- 'MAREK SCHWARZ',
802
- 'MARK DEKANICH',
803
- 'MARK VISENTIN',
804
- 'MARTIN BIRON',
805
- 'MARTIN BRODEUR',
806
- 'MARTIN GERBER',
807
- 'MARTIN JONES',
808
- 'MARTY TURCO',
809
- 'MAT ROBSON',
810
- 'MATHIEU CORBEIL',
811
- 'MATHIEU GARON',
812
- 'MATISS KIVLENIEKS',
813
- 'MATT CLIMIE',
814
- 'MATT DALTON',
815
- 'MATT HACKETT',
816
- 'MATT KEETLEY',
817
- 'MATT MURRAY',
818
- 'MATT TOMKINS',
819
- 'MATT VILLALTA',
820
- 'MATT ZABA',
821
- 'MATTHEW HEWITT',
822
- "MATTHEW O'CONNOR",
823
- 'MAXIME LAGACE',
824
- 'MICHAEL DIPIETRO',
825
- 'MICHAEL GARTEIG',
826
- 'MICHAEL HOUSER',
827
- 'MICHAEL HUTCHINSON',
828
- 'MICHAEL LEE',
829
- 'MICHAEL LEIGHTON',
830
- 'MICHAEL MCNIVEN',
831
- 'MICHAEL MOLE',
832
- 'MICHAEL MORRISON',
833
- 'MICHAEL WALL',
834
- 'MICHAL NEUVIRTH',
835
- 'MIIKA WIIKMAN',
836
- 'MIIKKA KIPRUSOFF',
837
- 'MIKAEL TELLQVIST',
838
- 'MIKE BRODEUR',
839
- 'MIKE CONDON',
840
- 'MIKE MCKENNA',
841
- 'MIKE MURPHY',
842
- 'MIKE SMITH',
843
- 'MIKKO KOSKINEN',
844
- 'MIROSLAV SVOBODA',
845
- 'NATHAN DEOBALD',
846
- 'NATHAN LAWSON',
847
- 'NATHAN LIEUWEN',
848
- 'NATHAN SCHOENFELD',
849
- 'NICK ELLIS',
850
- 'NICO DAWS',
851
- 'NIKKE KOKKO',
852
- 'NIKLAS BACKSTROM',
853
- 'NIKLAS LUNDSTROM',
854
- 'NIKLAS SVEDBERG',
855
- 'NIKLAS TREUTLE',
856
- 'NIKOLAI KHABIBULIN',
857
- 'NOLAN SCHAEFER',
858
- 'OLIE KOLZIG',
859
- 'OLLE ERIKSSON EK',
860
- 'ONDREJ PAVELEC',
861
- 'OSCAR DANSK',
862
- 'PASCAL LECLAIRE',
863
- 'PAT CONACHER',
864
- 'PATRICK KILLEEN',
865
- 'PATRICK LALIME',
866
- 'PAUL DEUTSCH',
867
- 'PAVEL FRANCOUZ',
868
- 'PEKKA RINNE',
869
- 'PETER BUDAJ',
870
- 'PETER MANNINO',
871
- 'PETR MRAZEK',
872
- 'PHEONIX COPLEY',
873
- 'PHILIPP GRUBAUER',
874
- 'PHILIPPE DESROSIERS',
875
- 'PYOTR KOCHETKOV',
876
- 'RAY EMERY',
877
- 'RETO BERRA',
878
- 'RICHARD BACHMAN',
879
- 'RICK DIPIETRO',
880
- 'RIKU HELENIUS',
881
- 'ROB LAURIE',
882
- 'ROB ZEPP',
883
- 'ROBB TALLAS',
884
- 'ROBBIE TALLAS',
885
- 'ROBERT MAYER',
886
- 'ROBERTO LUONGO',
887
- 'ROBIN LEHNER',
888
- 'ROMAN WILL',
889
- 'RYAN LOWE',
890
- 'RYAN MILLER',
891
- 'RYAN MUNCE',
892
- 'RYAN VINZ',
893
- 'SAM BRITTAIN',
894
- 'SAM MONTEMBEAULT',
895
- 'SAMI AITTOKALLIO',
896
- 'SAMUEL ERSSON',
897
- 'SAMUEL MONTEMBEAULT',
898
- 'SAM MONTEMBEAULT',
899
- 'SCOTT CLEMMENSEN',
900
- 'SCOTT DARLING',
901
- 'SCOTT FOSTER',
902
- 'SCOTT MUNROE',
903
- 'SCOTT STAJCER',
904
- 'SCOTT WEDGEWOOD',
905
- 'SEBASTIEN CARON',
906
- 'SEMYON VARLAMOV',
907
- 'SERGEI BOBROVSKY',
908
- 'SHAWN HUNWICK',
909
- 'SPENCER KNIGHT',
910
- 'SPENCER MARTIN',
911
- 'STEFANOS LEKKAS',
912
- 'STEVE MASON',
913
- 'STEVE MICHALEK',
914
- 'STEVE VALIQUETTE',
915
- 'STUART SKINNER',
916
- 'THATCHER DEMKO',
917
- 'THOMAS FENTON',
918
- 'THOMAS GREISS',
919
- 'THOMAS HODGES',
920
- 'TIM THOMAS',
921
- 'TIMO PIELMEIER',
922
- 'TIMOTHY JR. THOMAS',
923
- 'TOBIAS STEPHAN',
924
- 'TODD FORD',
925
- 'TOM MCCOLLUM',
926
- 'TOMAS POPPERLE',
927
- 'TOMAS VOKOUN',
928
- 'TORRIE JUNG',
929
- 'TRENT MINER',
930
- 'TRISTAN JARRY',
931
- 'TROY GROSENICK',
932
- 'TUUKKA RASK',
933
- 'TY CONKLIN',
934
- 'TYLER BUNZ',
935
- 'TYLER PLANTE',
936
- 'TYLER STEWART',
937
- 'TYLER WEIMAN',
938
- 'TYSON SEXSMITH',
939
- 'UKKO-PEKKA LUUKKONEN',
940
- 'VEINI VEHVILAINEN',
941
- 'VESA TOSKALA',
942
- 'VIKTOR FASTH',
943
- 'VILLE HUSSO',
944
- 'VITEK VANECEK',
945
- 'WADE DUBIELEWICZ',
946
- 'YANIV PERETS',
947
- 'YANN DANIS',
948
- 'YAROSLAV ASKAROV',
949
- 'ZACH FUCALE',
950
- 'ZACH SAWCHENKO',
951
- 'ZACH SIKICH',
952
- 'ZACHARY FUCALE',
953
- 'ZANE KALEMBA',
954
- 'ZANE MCINTYRE']
917
+ thisteam = home_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()
955
918
 
956
919
  players = dict()
957
920
 
@@ -985,12 +948,95 @@ def scrape_html_shifts(season, game_id):
985
948
  alldf = alldf._append(df)
986
949
 
987
950
  home_shifts = alldf
951
+
952
+ home_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/home_shifts.csv', index = False)
953
+
954
+ if live == True:
955
+
956
+ home_shifts = home_shifts.assign(shift_number = home_shifts.shift_number.astype(int))
957
+ home_shifts = home_shifts.assign(number = home_shifts.number.astype(int))
958
+
959
+ found = home_soup.find_all('td', {'class':['playerHeading + border', 'bborder + lborder +']})
960
+ if len(found)==0:
961
+ raise IndexError('This game has no shift data.')
962
+ thisteam = home_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()
963
+
964
+ players = dict()
965
+
966
+ for i in range(len(found)):
967
+ line = found[i].get_text()
968
+ if line == '25 PETTERSSON, ELIAS':
969
+ line = '25 PETTERSSON(D), ELIAS'
970
+ if ', ' in line:
971
+ name = line.split(',')
972
+ number = name[0].split(' ')[0].strip()
973
+ last_name = name[0].split(' ')[1].strip()
974
+ first_name = name[1].strip()
975
+ full_name = first_name + " " + last_name
976
+ players[full_name] = dict()
977
+ players[full_name]['number'] = number
978
+ players[full_name]['name'] = full_name
979
+ players[full_name]['shifts'] = []
980
+ else:
981
+ players[full_name]['shifts'].extend([line])
982
+
983
+ alldf = pd.DataFrame()
984
+
985
+ for key in players.keys():
986
+ length = length = int(len(players[key]['shifts'])/6)
987
+ df = df = pd.DataFrame(np.array((players[key]['shifts'])).reshape(length, 6)).rename(
988
+ columns = {0:'period', 1:'shifts', 2:'avg', 3:'TOI', 4:'EV Total', 5:'PP Total'})
989
+ df = df.assign(name = players[key]['name'],
990
+ number = players[key]['number'],
991
+ team = thisteam,
992
+ venue = "home")
993
+ alldf = alldf._append(df)
994
+
995
+ home_extra_shifts = alldf
996
+
997
+ shifts_needing_to_be_added = home_extra_shifts[home_extra_shifts.shifts=='0']
998
+
999
+ def subtract_from_twenty_minutes(time_string):
1000
+ # Parse the input time string
1001
+ minutes, seconds = map(int, time_string.split(':'))
1002
+
1003
+ # Convert to total seconds
1004
+ input_seconds = minutes * 60 + seconds
1005
+ twenty_minutes_seconds = 20 * 60 # 1200 seconds
1006
+
1007
+ # Calculate the difference
1008
+ difference_seconds = twenty_minutes_seconds - input_seconds
1009
+
1010
+ # Convert back to MM:SS format
1011
+ result_minutes = difference_seconds // 60
1012
+ result_seconds = difference_seconds % 60
1013
+
1014
+ # Format the result
1015
+ return f"{result_minutes}:{result_seconds:02d}"
1016
+
1017
+ shifts_needing_to_be_added = shifts_needing_to_be_added.assign(shift_start = '0:00 / ' + shifts_needing_to_be_added.TOI,
1018
+ shift_end = shifts_needing_to_be_added.TOI + ' / ' + shifts_needing_to_be_added.TOI.apply(lambda x: subtract_from_twenty_minutes(x)),
1019
+ duration = shifts_needing_to_be_added.TOI)
1020
+
1021
+ shifts_needing_to_be_added = shifts_needing_to_be_added.merge(
1022
+ home_shifts.assign(shift_number = home_shifts.shift_number.astype(int)).groupby('name')['shift_number'].max().reset_index().rename(columns = {'shift_number':'prior_max_shift'})
1023
+ )
1024
+
1025
+ shifts_needing_to_be_added = shifts_needing_to_be_added.assign(shift_number = shifts_needing_to_be_added.prior_max_shift + 1)
1026
+
1027
+ shifts_needing_to_be_added = shifts_needing_to_be_added.loc[:, ['shift_number', 'period', 'shift_start', 'shift_end', 'duration', 'name', 'number', 'team', 'venue']]
1028
+
1029
+ shifts_needing_to_be_added['number'] = shifts_needing_to_be_added['number'].astype(int)
1030
+
1031
+ home_shifts = pd.concat([home_shifts, shifts_needing_to_be_added]).sort_values(by = ['number', 'period', 'shift_number'])
988
1032
 
989
1033
  url = 'http://www.nhl.com/scores/htmlreports/' + season + '/TV0' + game_id + '.HTM'
990
1034
  page = (requests.get(url))
991
- soup = BeautifulSoup(page.content.decode('ISO-8859-1'), 'lxml', multi_valued_attributes = None)
992
- found = soup.find_all('td', {'class':['playerHeading + border', 'lborder + bborder']})
993
- thisteam = soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()
1035
+ away_soup = BeautifulSoup(page.content)
1036
+ found = away_soup.find_all('td', {'class':['playerHeading + border', 'lborder + bborder']})
1037
+ if len(found)==0:
1038
+ raise IndexError('This game has no shift data.')
1039
+ thisteam = away_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()
994
1040
 
995
1041
  players = dict()
996
1042
 
@@ -1022,12 +1068,163 @@ def scrape_html_shifts(season, game_id):
1022
1068
  team = thisteam,
1023
1069
  venue = "away")
1024
1070
  alldf = alldf._append(df)
1025
-
1071
+
1026
1072
  away_shifts = alldf
1027
-
1073
+
1074
+ away_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/away_shifts.csv', index = False)
1075
+
1076
+ if live == True:
1077
+
1078
+ away_shifts = away_shifts.assign(shift_number = away_shifts.shift_number.astype(int))
1079
+ away_shifts = away_shifts.assign(number = away_shifts.number.astype(int))
1080
+
1081
+ found = away_soup.find_all('td', {'class':['playerHeading + border', 'bborder + lborder +']})
1082
+ if len(found)==0:
1083
+ raise IndexError('This game has no shift data.')
1084
+ thisteam = away_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()
1085
+
1086
+ players = dict()
1087
+
1088
+ for i in range(len(found)):
1089
+ line = found[i].get_text()
1090
+ if line == '25 PETTERSSON, ELIAS':
1091
+ line = '25 PETTERSSON(D), ELIAS'
1092
+ if ', ' in line:
1093
+ name = line.split(',')
1094
+ number = name[0].split(' ')[0].strip()
1095
+ last_name = name[0].split(' ')[1].strip()
1096
+ first_name = name[1].strip()
1097
+ full_name = first_name + " " + last_name
1098
+ players[full_name] = dict()
1099
+ players[full_name]['number'] = number
1100
+ players[full_name]['name'] = full_name
1101
+ players[full_name]['shifts'] = []
1102
+ else:
1103
+ players[full_name]['shifts'].extend([line])
1104
+
1105
+ alldf = pd.DataFrame()
1106
+
1107
+ for key in players.keys():
1108
+ length = length = int(len(players[key]['shifts'])/6)
1109
+ df = df = pd.DataFrame(np.array((players[key]['shifts'])).reshape(length, 6)).rename(
1110
+ columns = {0:'period', 1:'shifts', 2:'avg', 3:'TOI', 4:'EV Total', 5:'PP Total'})
1111
+ df = df.assign(name = players[key]['name'],
1112
+ number = players[key]['number'],
1113
+ team = thisteam,
1114
+ venue = "away")
1115
+ alldf = alldf._append(df)
1116
+
1117
+ away_extra_shifts = alldf
1118
+
1119
+ shifts_needing_to_be_added = away_extra_shifts[away_extra_shifts.shifts=='0']
1120
+
1121
+ def subtract_from_twenty_minutes(time_string):
1122
+ # Parse the input time string
1123
+ minutes, seconds = map(int, time_string.split(':'))
1124
+
1125
+ # Convert to total seconds
1126
+ input_seconds = minutes * 60 + seconds
1127
+ twenty_minutes_seconds = 20 * 60 # 1200 seconds
1128
+
1129
+ # Calculate the difference
1130
+ difference_seconds = twenty_minutes_seconds - input_seconds
1131
+
1132
+ # Convert back to MM:SS format
1133
+ result_minutes = difference_seconds // 60
1134
+ result_seconds = difference_seconds % 60
1135
+
1136
+ # Format the result
1137
+ return f"{result_minutes}:{result_seconds:02d}"
1138
+
1139
+ shifts_needing_to_be_added = shifts_needing_to_be_added.assign(shift_start = '0:00 / ' + shifts_needing_to_be_added.TOI.astype(str),
1140
+ shift_end = shifts_needing_to_be_added.TOI.astype(str) + ' / ' + shifts_needing_to_be_added.TOI.apply(lambda x: subtract_from_twenty_minutes(x)),
1141
+ duration = shifts_needing_to_be_added.TOI.astype(str))
1142
+
1143
+ shifts_needing_to_be_added = shifts_needing_to_be_added.merge(
1144
+ away_shifts.assign(shift_number = away_shifts.shift_number.astype(int)).groupby('name')['shift_number'].max().reset_index().rename(columns = {'shift_number':'prior_max_shift'})
1145
+ )
1146
+
1147
+ shifts_needing_to_be_added = shifts_needing_to_be_added.assign(shift_number = shifts_needing_to_be_added.prior_max_shift + 1)
1148
+
1149
+ shifts_needing_to_be_added = shifts_needing_to_be_added.loc[:, ['shift_number', 'period', 'shift_start', 'shift_end', 'duration', 'name', 'number', 'team', 'venue']]
1150
+
1151
+ shifts_needing_to_be_added['number'] = shifts_needing_to_be_added['number'].astype(int)
1152
+
1153
+ away_shifts = pd.concat([away_shifts, shifts_needing_to_be_added]).sort_values(by = ['number', 'period', 'shift_number'])
1154
+
1155
+ # Additional logic to handle period 1 scrape when we don't have goalie shifts yet.
1156
+
1157
+ if len(home_shifts[(home_shifts.name.isin(goalie_names))]) == 0 or len(away_shifts[(away_shifts.name.isin(goalie_names))]):
1158
+
1159
+ pbp_html_url = f'https://www.nhl.com/scores/htmlreports/{season}/GS0{game_id}.HTM'
1160
+ pbp_soup = BeautifulSoup(requests.get(pbp_html_url).content)
1161
+ goalie_header = pbp_soup.find('td', text='GOALTENDER SUMMARY')
1162
+
1163
+ # Navigate to the table containing goalie data
1164
+ goalie_table = goalie_header.find_next('table')
1165
+
1166
+ away_teams = pd.read_html(str(goalie_table))[0][:2]
1167
+ away_team = away_teams[0].iloc[0]
1168
+
1169
+ away_goalies = pd.read_html(str(goalie_table))[0][2:4]
1170
+ away_goalies = away_goalies[~pd.isna(away_goalies[6])]
1171
+
1172
+ away_goalies = away_goalies.assign(team = away_team).rename(columns = {0:'number', 2:'name', 6:'TOI'}).loc[:, ['number', 'name', 'TOI', 'team']]
1173
+
1174
+ home_teams = pd.read_html(str(goalie_table))[0][6:7]
1175
+ home_team = home_teams[0].iloc[0]
1176
+
1177
+ home_goalies = pd.read_html(str(goalie_table))[0][8:10]
1178
+ home_goalies = home_goalies[~pd.isna(home_goalies[6])]
1179
+
1180
+ home_goalies = home_goalies.assign(team = home_team).rename(columns = {0:'number', 2:'name', 6:'TOI'}).loc[:, ['number', 'name', 'TOI', 'team']]
1181
+
1182
+ home_goalies = pd.read_html(str(goalie_table))[0][8:9]
1183
+
1184
+ # Temporary to test. Will fix later.
1185
+
1186
+ # home_goalies = home_goalies.assign(TOI = '11:26')
1187
+ # away_goalies = away_goalies.assign(TOI = '11:26')
1188
+
1189
+ if len(home_shifts[(home_shifts.name.isin(goalie_names))]) == 0:
1190
+
1191
+ home_goalie_shift = home_goalies.assign(shift_number = 1,
1192
+ period = 1,
1193
+ name = home_goalies.name.str.split(', ').str[1] + ' ' + home_goalies.name.str.split(', ').str[0],
1194
+ shift_start = '0:00 / 20:00',
1195
+ shift_end = home_goalies.TOI + ' / ' + home_goalies.TOI.apply(lambda x: subtract_from_twenty_minutes(x)),
1196
+ duration = home_goalies.TOI,
1197
+ venue = 'home').loc[
1198
+ :, ['shift_number', 'period', 'shift_start', 'shift_end', 'duration', 'name', 'number', 'team', 'venue']]
1199
+
1200
+ home_goalie_shift = home_goalie_shift.assign(period = home_goalie_shift.period.astype(int),
1201
+ shift_number = home_goalie_shift.shift_number.astype(int),
1202
+ number = home_goalie_shift.number.astype(int))
1203
+
1204
+ home_shifts = pd.concat([home_shifts, home_goalie_shift]).sort_values(by = ['number', 'period', 'shift_number'])
1205
+
1206
+ if len(away_shifts[(away_shifts.name.isin(goalie_names))]) == 0:
1207
+
1208
+ away_goalie_shift = away_goalies.assign(shift_number = 1,
1209
+ period = 1,
1210
+ name = away_goalies.name.str.split(', ').str[1] + ' ' + away_goalies.name.str.split(', ').str[0],
1211
+ shift_start = '0:00 / 20:00',
1212
+ shift_end = away_goalies.TOI + ' / ' + away_goalies.TOI.apply(lambda x: subtract_from_twenty_minutes(x)),
1213
+ duration = away_goalies.TOI,
1214
+ venue = 'away').loc[
1215
+ :, ['shift_number', 'period', 'shift_start', 'shift_end', 'duration', 'name', 'number', 'team', 'venue']]
1216
+
1217
+ away_goalie_shift = away_goalie_shift.assign(period = away_goalie_shift.period.astype(int),
1218
+ shift_number = away_goalie_shift.shift_number.astype(int),
1219
+ number = away_goalie_shift.number.astype(int))
1220
+
1221
+ away_shifts = pd.concat([away_shifts, away_goalie_shift]).sort_values(by = ['number', 'period', 'shift_number'])
1222
+
1028
1223
  global all_shifts
1029
1224
 
1030
1225
  all_shifts = pd.concat([home_shifts, away_shifts])
1226
+
1227
+ #all_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/all_shifts.csv', index = False)
1031
1228
 
1032
1229
  all_shifts = all_shifts.assign(start_time = all_shifts.shift_start.str.split('/').str[0])
1033
1230
 
@@ -1232,8 +1429,20 @@ def scrape_html_shifts(season, game_id):
1232
1429
 
1233
1430
  all_shifts['name'] = all_shifts['name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
1234
1431
 
1432
+ # Apply regex to remove (A) and (C) designations at end of names
1433
+ all_shifts['name'] = all_shifts['name'].apply(lambda x: re.sub(r' \(A\)$', '', x).strip())
1434
+ all_shifts['name'] = all_shifts['name'].apply(lambda x: re.sub(r' \(C\)$', '', x).strip())
1435
+
1436
+ # Apply specific name corrections
1235
1437
  all_shifts['name'] = np.where(all_shifts['name']== "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY", all_shifts['name']) # Need to do this after normalization, only then he becomes Slafkovska?
1438
+ all_shifts['name'] = np.where(all_shifts['name']== "JOHN (JACK) ROSLOVIC" , "JACK ROSLOVIC", all_shifts['name'])
1439
+ all_shifts['name'] = np.where(all_shifts['name']== "ANTHONY-JOHN (AJ) GREER" , "A.J. GREER", all_shifts['name'])
1440
+
1441
+ all_shifts['name'] = np.where(all_shifts['name']== 'MARTIN FEHARVARY' , 'MARTIN FEHERVARY', all_shifts['name'])
1442
+
1443
+ all_shifts['name'] = np.where(all_shifts['name']== 'MATAJ BLAMEL' , 'MATAJ BLAMEL', all_shifts['name'])
1236
1444
 
1445
+ all_shifts['name'] = all_shifts['name'].str.replace(' ', ' ')
1237
1446
 
1238
1447
  all_shifts = all_shifts.assign(end_time = np.where(pd.to_datetime(all_shifts.start_time).dt.time > pd.to_datetime(all_shifts.end_time).dt.time, '20:00', all_shifts.end_time),
1239
1448
  goalie = np.where(all_shifts.name.isin(goalie_names), 1, 0))
@@ -1265,10 +1474,16 @@ def scrape_html_shifts(season, game_id):
1265
1474
  global changes_on
1266
1475
  global changes_off
1267
1476
  myshifts = all_shifts
1477
+ #myshifts.to_csv('/Users/patrickbacon/compact_topdownhockey/tmp.csv', index = False)
1478
+ #print('Printing my shifts')
1479
+
1480
+ #print(myshifts)
1268
1481
 
1269
1482
  myshifts.start_time = myshifts.start_time.str.strip()
1270
1483
  myshifts.end_time = myshifts.end_time.str.strip()
1271
1484
 
1485
+ myshifts['number'] = myshifts.number.astype(str)
1486
+
1272
1487
  changes_on = myshifts.groupby(['team', 'period', 'start_time']).agg(
1273
1488
  on = ('name', ', '.join),
1274
1489
  on_numbers = ('number', ', '.join),
@@ -1293,6 +1508,8 @@ def scrape_html_shifts(season, game_id):
1293
1508
  3900))
1294
1509
 
1295
1510
  full_changes = full_changes.assign(team = np.where(full_changes.team=='CANADIENS MONTREAL', 'MONTREAL CANADIENS', full_changes.team))
1511
+
1512
+ full_changes = full_changes.assign(team = np.where(full_changes.team=='MONTRÉAL CANADIENS', 'MONTREAL CANADIENS', full_changes.team))
1296
1513
 
1297
1514
  return full_changes.reset_index(drop = True)#.drop(columns = ['time', 'period_seconds'])
1298
1515
 
@@ -1559,7 +1776,20 @@ def scrape_api_events(game_id, drop_description = True, shift_to_espn = False):
1559
1776
 
1560
1777
  api_events['ep1_name'] = api_events['ep1_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
1561
1778
 
1779
+ # Apply regex to remove (A) and (C) designations at end of names
1780
+ api_events['ep1_name'] = api_events['ep1_name'].apply(lambda x: re.sub(r' \(A\)$', '', x).strip())
1781
+ api_events['ep1_name'] = api_events['ep1_name'].apply(lambda x: re.sub(r' \(C\)$', '', x).strip())
1782
+
1783
+ # Apply specific name corrections
1562
1784
  api_events['ep1_name'] = np.where(api_events['ep1_name'] == "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY", api_events['ep1_name']) # Need to do this after normalization, only then he becomes Slafkovska?
1785
+ api_events['ep1_name'] = np.where(api_events['ep1_name'] == "JOHN (JACK) ROSLOVIC" , "JACK ROSLOVIC", api_events['ep1_name'])
1786
+ api_events['ep1_name'] = np.where(api_events['ep1_name'] == "ANTHONY-JOHN (AJ) GREER" , "A.J. GREER", api_events['ep1_name'])
1787
+
1788
+ api_events['ep1_name'] = np.where(api_events['ep1_name'] == 'MARTIN FEHARVARY' , 'MARTIN FEHERVARY', api_events['ep1_name'])
1789
+
1790
+ api_events['ep1_name'] = np.where(api_events['ep1_name'] == 'MATAJ BLAMEL' , 'MATAJ BLAMEL', api_events['ep1_name'])
1791
+
1792
+ api_events['ep1_name'] = api_events['ep1_name'].str.replace(' ', ' ')
1563
1793
 
1564
1794
  api_events = api_events.assign(ep1_name = np.where(api_events.ep1_name=='ALEX BARRÉ-BOULET', 'ALEX BARRE_BOULET', api_events.ep1_name))
1565
1795
 
@@ -1679,8 +1909,9 @@ def scrape_html_events(season, game_id):
1679
1909
  game.home_team_abbreviated.iloc[0],
1680
1910
  game.away_team_abbreviated.iloc[0]))
1681
1911
 
1682
- roster = roster.assign(teamnum = roster.team_abbreviated + roster['#'],
1683
- Name = roster.Name.str.split('(').str[0].str.strip())
1912
+ roster = roster.assign(teamnum = roster.team_abbreviated + roster['#'])
1913
+ roster['Name'] = roster.Name.apply(lambda x: re.sub(r' \(A\)$', '', x).strip())
1914
+ roster['Name'] = roster.Name.apply(lambda x: re.sub(r' \(C\)$', '', x).strip())
1684
1915
 
1685
1916
  event_player_1s = roster.loc[:, ['teamnum', 'Name']].rename(columns = {'teamnum':'event_player_1', 'Name':'ep1_name'})
1686
1917
  event_player_2s = roster.loc[:, ['teamnum', 'Name']].rename(columns = {'teamnum':'event_player_2', 'Name':'ep2_name'})
@@ -1743,6 +1974,9 @@ def scrape_html_events(season, game_id):
1743
1974
 
1744
1975
  game = game.assign(home_team = np.where(game.home_team=='CANADIENS MONTREAL', 'MONTREAL CANADIENS', game.home_team),
1745
1976
  away_team = np.where(game.away_team=='CANADIENS MONTREAL', 'MONTREAL CANADIENS', game.away_team))
1977
+
1978
+ game = game.assign(home_team = np.where(game.home_team=='MONTRÉAL CANADIENS', 'MONTREAL CANADIENS', game.home_team),
1979
+ away_team = np.where(game.away_team=='MONTRÉAL CANADIENS', 'MONTREAL CANADIENS', game.away_team))
1746
1980
 
1747
1981
  if int(game_id[0])!=3:
1748
1982
  game = game[game.game_seconds<4000]
@@ -1940,7 +2174,8 @@ def scrape_espn_events(espn_game_id, drop_description = True):
1940
2174
  (np.where(espn_events['event_player_1']== "EMIL LILLEBERG" , "EMIL MARTINSEN LILLEBERG",
1941
2175
  (np.where(espn_events['event_player_1']== "CAMERON ATKINSON" , "CAM ATKINSON",
1942
2176
  (np.where(espn_events['event_player_1']== "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY",
1943
- espn_events['event_player_1']))))))))))))))))))))))))))))))))))
2177
+ (np.where(espn_events['event_player_1']== "MARTIN FEHARVARY" , "MARTIN FEHERVARY",
2178
+ espn_events['event_player_1']))))))))))))))))))))))))))))))))))))
1944
2179
 
1945
2180
 
1946
2181
  espn_events = espn_events.assign(version =
@@ -1974,8 +2209,21 @@ def scrape_espn_events(espn_game_id, drop_description = True):
1974
2209
 
1975
2210
  espn_events['event_player_1'] = espn_events['event_player_1'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
1976
2211
 
1977
- espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY", espn_events['event_player_1'])
2212
+ # Apply regex to remove (A) and (C) designations at end of names
2213
+ espn_events['event_player_1'] = espn_events['event_player_1'].apply(lambda x: re.sub(r' \(A\)$', '', x).strip())
2214
+ espn_events['event_player_1'] = espn_events['event_player_1'].apply(lambda x: re.sub(r' \(C\)$', '', x).strip())
2215
+
2216
+ # Apply specific name corrections
2217
+ espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY", espn_events['event_player_1'])
2218
+ espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == "JOHN (JACK) ROSLOVIC" , "JACK ROSLOVIC", espn_events['event_player_1'])
2219
+ espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == "ANTHONY-JOHN (AJ) GREER" , "A.J. GREER", espn_events['event_player_1'])
2220
+
2221
+ espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == 'MARTIN FEHARVARY' , 'MARTIN FEHERVARY', espn_events['event_player_1'])
2222
+
2223
+ espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == 'MATAJ BLAMEL' , 'MATAJ BLAMEL', espn_events['event_player_1'])
1978
2224
 
2225
+ espn_events['event_player_1'] = espn_events['event_player_1'].str.replace(' ', ' ')
2226
+
1979
2227
  #espn_events = espn_events.assign(event_player_1 = np.where(
1980
2228
  #espn_events.event_player_1=='ALEX BURROWS', 'ALEXANDRE BURROWS', espn_events.event_player_1))
1981
2229
 
@@ -2075,9 +2323,11 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
2075
2323
  np.where(gamedays.away_team=='AVALANCHE', 'COL',
2076
2324
  np.where(gamedays.away_team=='BLUE', 'CBJ',
2077
2325
  np.where(gamedays.away_team=='JACKETS', 'CBJ',
2326
+ np.where(gamedays.away_team=='BLUE JACKETS', 'CBJ',
2078
2327
  np.where(gamedays.away_team=='STARS', 'DAL',
2079
2328
  np.where(gamedays.away_team=='RED', 'DET',
2080
2329
  np.where(gamedays.away_team=='WINGS', 'DET',
2330
+ np.where(gamedays.away_team=='RED WINGS', 'DET',
2081
2331
  np.where(gamedays.away_team=='OILERS', 'EDM',
2082
2332
  np.where(gamedays.away_team=='PANTHERS', 'FLA',
2083
2333
  np.where(gamedays.away_team=='KINGS', 'LAK',
@@ -2096,15 +2346,17 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
2096
2346
  np.where(gamedays.away_team=='LIGHTNING', 'TBL',
2097
2347
  np.where(gamedays.away_team=='LEAFS', 'TOR',
2098
2348
  np.where(gamedays.away_team=='MAPLE', 'TOR',
2349
+ np.where(gamedays.away_team=='MAPLE LEAFS', 'TOR',
2099
2350
  np.where(gamedays.away_team=='CANUCKS', 'VAN',
2100
2351
  np.where(gamedays.away_team=='GOLDEN', 'VGK',
2101
2352
  np.where(gamedays.away_team=='KNIGHTS', 'VGK',
2353
+ np.where(gamedays.away_team=='GOLDEN KNIGHTS', 'VGK',
2102
2354
  np.where(gamedays.away_team=='CAPITALS', 'WSH',
2103
2355
  np.where(gamedays.away_team=='JETS', 'WPG',
2104
2356
  np.where(gamedays.away_team=='CLUB', 'UTA',
2357
+ np.where(gamedays.away_team=='MAMMOTH', 'UTA',
2105
2358
  np.where(gamedays.away_team=='HOCKEY', 'UTA', 'mistake'
2106
- ))))))))))))))))))))))))))))))))))))))
2107
- )
2359
+ ))))))))))))))))))))))))))))))))))))))))))))
2108
2360
 
2109
2361
  gamedays = gamedays.assign(
2110
2362
  home_team = np.where(gamedays.home_team=='DUCKS', 'ANA',
@@ -2117,9 +2369,11 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
2117
2369
  np.where(gamedays.home_team=='AVALANCHE', 'COL',
2118
2370
  np.where(gamedays.home_team=='BLUE', 'CBJ',
2119
2371
  np.where(gamedays.home_team=='JACKETS', 'CBJ',
2372
+ np.where(gamedays.home_team=='BLUE JACKETS', 'CBJ',
2120
2373
  np.where(gamedays.home_team=='STARS', 'DAL',
2121
2374
  np.where(gamedays.home_team=='RED', 'DET',
2122
2375
  np.where(gamedays.home_team=='WINGS', 'DET',
2376
+ np.where(gamedays.home_team=='RED WINGS', 'DET',
2123
2377
  np.where(gamedays.home_team=='OILERS', 'EDM',
2124
2378
  np.where(gamedays.home_team=='PANTHERS', 'FLA',
2125
2379
  np.where(gamedays.home_team=='KINGS', 'LAK',
@@ -2138,15 +2392,17 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
2138
2392
  np.where(gamedays.home_team=='LIGHTNING', 'TBL',
2139
2393
  np.where(gamedays.home_team=='MAPLE', 'TOR',
2140
2394
  np.where(gamedays.home_team=='LEAFS', 'TOR',
2395
+ np.where(gamedays.home_team=='MAPLE LEAFS', 'TOR',
2141
2396
  np.where(gamedays.home_team=='CANUCKS', 'VAN',
2142
2397
  np.where(gamedays.home_team=='GOLDEN', 'VGK',
2143
2398
  np.where(gamedays.home_team=='KNIGHTS', 'VGK',
2399
+ np.where(gamedays.home_team=='GOLDEN KNIGHTS', 'VGK',
2144
2400
  np.where(gamedays.home_team=='CAPITALS', 'WSH',
2145
2401
  np.where(gamedays.home_team=='JETS', 'WPG',
2146
2402
  np.where(gamedays.home_team=='CLUB', 'UTA',
2403
+ np.where(gamedays.home_team=='MAMMOTH', 'UTA',
2147
2404
  np.where(gamedays.home_team=='HOCKEY', 'UTA', 'mistake'
2148
- ))))))))))))))))))))))))))))))))))))))
2149
- )
2405
+ ))))))))))))))))))))))))))))))))))))))))))))
2150
2406
 
2151
2407
  gamedays = gamedays[(gamedays.game_date==this_date) & (gamedays.home_team==home_team) & (gamedays.away_team==away_team)]
2152
2408
 
@@ -2159,9 +2415,23 @@ def merge_and_prepare(events, shifts):
2159
2415
  game_id = int(events.game_id.iloc[0])
2160
2416
 
2161
2417
  merged = pd.concat([events, shifts])
2418
+
2419
+ home_team = merged[~(pd.isna(merged.home_team))].home_team.iloc[0]
2420
+ #print(home_team)
2421
+ away_team = merged[~(pd.isna(merged.away_team))].away_team.iloc[0]
2422
+ #print(away_team)
2423
+
2424
+ if 'CANADIENS' in home_team:
2425
+ home_team = 'MONTREAL CANADIENS'
2426
+
2427
+ if 'CANADIENS' in away_team:
2428
+ away_team = 'MONTREAL CANADIENS'
2429
+
2430
+ #print(home_team)
2431
+ #print(away_team)
2162
2432
 
2163
- merged = merged.assign(home_team = merged[~(pd.isna(merged.home_team))].home_team.iloc[0],
2164
- away_team = merged[~(pd.isna(merged.away_team))].away_team.iloc[0],
2433
+ merged = merged.assign(home_team = home_team,
2434
+ away_team = away_team,
2165
2435
  home_team_abbreviated = merged[~(pd.isna(merged.home_team_abbreviated))].home_team_abbreviated.iloc[0],
2166
2436
  away_team_abbreviated = merged[~(pd.isna(merged.away_team_abbreviated))].away_team_abbreviated.iloc[0])
2167
2437
 
@@ -2211,6 +2481,8 @@ def merge_and_prepare(events, shifts):
2211
2481
  np.where(merged.event.isin(['PGSTR', 'PGEND', 'PSTR', 'PEND', 'ANTHEM']), -1, 1))).sort_values(
2212
2482
  by = ['game_seconds', 'period', 'event_index'])
2213
2483
 
2484
+ merged.to_csv('/Users/patrickbacon/compact_topdownhockey/first_merged.csv', index = False)
2485
+
2214
2486
  merged = merged.assign(change_before_event = np.where(
2215
2487
  (
2216
2488
  (merged.away_on_ice!='') & (merged.event.shift()=='CHANGE') & (merged.away_on_ice!=merged.away_on_ice.shift()) |
@@ -2242,8 +2514,9 @@ def merge_and_prepare(events, shifts):
2242
2514
  merged.home_team_abbreviated.iloc[0],
2243
2515
  merged.away_team_abbreviated.iloc[0]))
2244
2516
 
2245
- roster = roster.assign(teamnum = roster.team_abbreviated + roster['#'],
2246
- Name = roster.Name.str.split('(').str[0].str.strip())
2517
+ roster = roster.assign(teamnum = roster.team_abbreviated + roster['#'])
2518
+ roster['Name'] = roster.Name.apply(lambda x: re.sub(r' \(A\)$', '', x).strip())
2519
+ roster['Name'] = roster.Name.apply(lambda x: re.sub(r' \(C\)$', '', x).strip())
2247
2520
 
2248
2521
  roster = roster.assign(Name = np.where((roster.Name=='SEBASTIAN AHO') &( roster.team_name == 'NEW YORK ISLANDERS'), 'SEBASTIAN AHO (SWE)', roster.Name))
2249
2522
  roster = roster.assign(Name = np.where((roster.Name=='ELIAS PETTERSSON') &( roster.Pos == 'D'), 'ELIAS PETTERSSON(D)', roster.Name))
@@ -2562,7 +2835,7 @@ def fix_missing(single, event_coords, events):
2562
2835
 
2563
2836
  return(events)
2564
2837
 
2565
- def full_scrape_1by1(game_id_list, shift_to_espn = True):
2838
+ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):
2566
2839
 
2567
2840
  global single
2568
2841
  global event_coords
@@ -2585,7 +2858,9 @@ def full_scrape_1by1(game_id_list, shift_to_espn = True):
2585
2858
  print('Attempting scrape for: ' + str(game_id))
2586
2859
  season = str(int(str(game_id)[:4])) + str(int(str(game_id)[:4]) + 1)
2587
2860
  small_id = str(game_id)[5:]
2861
+ print('Scraping HTML events')
2588
2862
  single = scrape_html_events(season, small_id)
2863
+ print('Scraped HTML events')
2589
2864
  single['game_id'] = int(game_id)
2590
2865
 
2591
2866
  # If all goes well with the HTML scrape:
@@ -2604,7 +2879,7 @@ def full_scrape_1by1(game_id_list, shift_to_espn = True):
2604
2879
  print('Issue when fixing problematic events. Here it is: ' + str(e))
2605
2880
  continue
2606
2881
  try:
2607
- shifts = scrape_html_shifts(season, small_id)
2882
+ shifts = scrape_html_shifts(season, small_id, live)
2608
2883
  finalized = merge_and_prepare(events, shifts)
2609
2884
  full = full._append(finalized)
2610
2885
  second_time = time.time()
@@ -2632,8 +2907,26 @@ def full_scrape_1by1(game_id_list, shift_to_espn = True):
2632
2907
  home_team = single['home_team_abbreviated'].iloc[0]
2633
2908
  away_team = single['away_team_abbreviated'].iloc[0]
2634
2909
  game_date = single['game_date'].iloc[0]
2910
+ espn_home_team = home_team
2911
+ espn_away_team = away_team
2635
2912
  try:
2636
- espn_id = scrape_espn_ids_single_game(str(game_date.date()), home_team, away_team).espn_id.iloc[0]
2913
+ if home_team == 'T.B':
2914
+ espn_home_team = 'TBL'
2915
+ if away_team == 'T.B':
2916
+ espn_away_team = 'TBL'
2917
+ if home_team == 'L.A':
2918
+ espn_home_team = 'LAK'
2919
+ if away_team == 'L.A':
2920
+ espn_away_team = 'LAK'
2921
+ if home_team == 'N.J':
2922
+ espn_home_team = 'NJD'
2923
+ if away_team == 'N.J':
2924
+ espn_away_team = 'NJD'
2925
+ if home_team == 'S.J':
2926
+ espn_home_team = 'SJS'
2927
+ if away_team == 'S.J':
2928
+ espn_away_team = 'SJS'
2929
+ espn_id = scrape_espn_ids_single_game(str(game_date.date()), espn_home_team, espn_away_team).espn_id.iloc[0]
2637
2930
  event_coords = scrape_espn_events(int(espn_id))
2638
2931
  event_coords['coordinate_source'] = 'espn'
2639
2932
  events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'period', 'version', 'event'], how = 'left').drop(columns = ['espn_id'])
@@ -2646,7 +2939,7 @@ def full_scrape_1by1(game_id_list, shift_to_espn = True):
2646
2939
  print('This game does not have ESPN or API coordinates. You will get it anyway, though.')
2647
2940
  events = single
2648
2941
  try:
2649
- shifts = scrape_html_shifts(season, small_id)
2942
+ shifts = scrape_html_shifts(season, small_id, live)
2650
2943
  finalized = merge_and_prepare(events, shifts)
2651
2944
  full = full._append(finalized)
2652
2945
  second_time = time.time()
@@ -2739,7 +3032,7 @@ def full_scrape_1by1(game_id_list, shift_to_espn = True):
2739
3032
  events = single
2740
3033
  events['coordinate_source'] = 'none'
2741
3034
  try:
2742
- shifts = scrape_html_shifts(season, small_id)
3035
+ shifts = scrape_html_shifts(season, small_id, live)
2743
3036
  finalized = merge_and_prepare(events, shifts)
2744
3037
  full = full._append(finalized)
2745
3038
  second_time = time.time()
@@ -2895,12 +3188,46 @@ def full_scrape_1by1(game_id_list, shift_to_espn = True):
2895
3188
 
2896
3189
  return full
2897
3190
 
2898
- def full_scrape(game_id_list, shift = False):
3191
+ def full_scrape(game_id_list, live = True, shift = False):
2899
3192
 
2900
3193
  global hidden_patrick
2901
3194
  hidden_patrick = 0
2902
3195
 
2903
- df = full_scrape_1by1(game_id_list, shift_to_espn = shift)
3196
+ df = full_scrape_1by1(game_id_list, live, shift_to_espn = shift)
3197
+
3198
+ # Fixing the Pettersson issue for event player. Just going downstream for this.
3199
+ df = df.assign(
3200
+ event_player_1 = np.where(
3201
+ (df.event_player_1 == 'ELIAS PETTERSSON') &
3202
+ (df.event_description.str.contains('#', na=False)) &
3203
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
3204
+ (df.event_description.str.extract(r'#(\d+) PETTERSSON', expand=False) == '25'),
3205
+ 'ELIAS PETTERSSON(D)', df.event_player_1),
3206
+ event_player_2 = np.where(
3207
+ (df.event_player_2 == 'ELIAS PETTERSSON') &
3208
+ (
3209
+ # Goal and Petey got A1
3210
+ ((df.event_type == 'GOAL') &
3211
+ (df.event_description.str.contains(': #', na=False)) &
3212
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
3213
+ (df.event_description.str.extract(r': #(\d+) PETTERSSON', expand=False) == '25')) |
3214
+ # Not a goal, Petey was EP2
3215
+ ((df.event_type != 'GOAL') &
3216
+ (df.event_description.str.contains('VAN #', na=False)) &
3217
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
3218
+ (df.event_description.str.extract(r'VAN #(\d+) PETTERSSON', expand=False) == '25'))
3219
+ ),
3220
+ 'ELIAS PETTERSSON(D)', df.event_player_2),
3221
+ event_player_3 = np.where(
3222
+ (df.event_player_3=='ELIAS PETTERSSON') &
3223
+ (df.event_description.str.contains('#', na=False)) &
3224
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
3225
+ (df.event_description.str.extract(r'#(\d+) PETTERSSON(?:\s|$)', expand=False) == '25'),
3226
+ 'ELIAS PETTERSSON(D)', df.event_player_3)
3227
+ )
3228
+
3229
+ # Don't even need this, we've had this problem with Stutzle for years, just let it be.
3230
+ # df.event_description = df.event_description.str.replace('FEHÃ\x89RVÃ\x81RY', 'FEHERVARY').str.replace('BLÜMEL', 'BLAMEL')
2904
3231
 
2905
3232
  if (hidden_patrick==0) and (len(df)>0):
2906
3233