fugue 0.8.6.dev1__py3-none-any.whl → 0.8.6.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -113,6 +113,13 @@ class DataFrame(Dataset):
     def as_pandas(self) -> pd.DataFrame:
         """Convert to pandas DataFrame"""
         pdf = pd.DataFrame(self.as_array(), columns=self.columns)
+        if len(pdf) == 0:  # TODO: move to triad
+            return pd.DataFrame(
+                {
+                    k: pd.Series(dtype=v.type.to_pandas_dtype())
+                    for k, v in self.schema.items()
+                }
+            )
         return PD_UTILS.enforce_type(pdf, self.schema.pa_schema, null_safe=True)

     def as_arrow(self, type_safe: bool = False) -> pa.Table:
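Why the new branch matters: constructing a pandas DataFrame from an empty array loses all type information, because every column comes back as `object`. The fix builds typed empty Series directly from the pyarrow field types. A minimal sketch of the difference, using an illustrative two-column schema (not taken from the fugue codebase):

```python
import pandas as pd
import pyarrow as pa

schema = pa.schema([("a", pa.int64()), ("b", pa.string())])

# naive construction: with zero rows, every dtype collapses to object
naive = pd.DataFrame([], columns=[f.name for f in schema])
print(naive.dtypes.tolist())  # [dtype('O'), dtype('O')]

# dtype-preserving construction, mirroring the new branch above
typed = pd.DataFrame(
    {f.name: pd.Series(dtype=f.type.to_pandas_dtype()) for f in schema}
)
print(typed.dtypes.tolist())  # [dtype('int64'), dtype('O')]
```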
@@ -165,7 +165,7 @@ class LocalDataFrameIterableDataFrame(LocalUnboundedDataFrame):

     def as_pandas(self) -> pd.DataFrame:
         if self.empty:
-            return ArrayDataFrame([], self.schema).as_pandas()
+            return PandasDataFrame(schema=self.schema).as_pandas()

        return pd.concat(df.as_pandas() for df in self.native)

@@ -23,9 +23,10 @@ from triad.collections.function_wrapper import (
 from triad.utils.iter import EmptyAwareIterable, make_empty_aware

 from ..constants import FUGUE_ENTRYPOINT
+from ..dataset.api import count as df_count
 from .array_dataframe import ArrayDataFrame
 from .arrow_dataframe import ArrowDataFrame
-from .dataframe import DataFrame, LocalDataFrame
+from .dataframe import AnyDataFrame, DataFrame, LocalDataFrame, as_fugue_df
 from .dataframe_iterable_dataframe import (
     IterableArrowDataFrame,
     IterablePandasDataFrame,
@@ -172,6 +173,19 @@ class DataFrameParam(_DataFrameParamBase):
         return sum(1 for _ in df.as_array_iterable())


+@fugue_annotated_param(AnyDataFrame)
+class _AnyDataFrameParam(DataFrameParam):
+    def to_output_df(self, output: AnyDataFrame, schema: Any, ctx: Any) -> DataFrame:
+        return (
+            as_fugue_df(output)
+            if schema is None
+            else as_fugue_df(output, schema=schema)
+        )
+
+    def count(self, df: Any) -> int:
+        return df_count(df)
+
+
 @fugue_annotated_param(LocalDataFrame, "l", child_can_reuse_code=True)
 class LocalDataFrameParam(DataFrameParam):
     def to_input_data(self, df: DataFrame, ctx: Any) -> LocalDataFrame:
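With `_AnyDataFrameParam` registered, interface functions can be annotated with `AnyDataFrame`: whatever backend object they return is normalized through `as_fugue_df`, and row counting goes through the dataset-level `count` imported above as `df_count`. A hedged usage sketch in the spirit of the tests later in this diff (`make_df` and `combine` are illustrative names, run here on the default native engine):

```python
import fugue.api as fa
from fugue import AnyDataFrame, FugueWorkflow

def make_df(p: int) -> AnyDataFrame:
    # may return any recognized dataframe object, not only fugue DataFrames
    return fa.as_fugue_df([[p]], schema="a:int")

def combine(df1: AnyDataFrame, df2: AnyDataFrame) -> AnyDataFrame:
    # fa.count works on any recognized dataframe via the dataset-level API
    return fa.as_fugue_df([[fa.count(df1) + fa.count(df2)]], schema="a:int")

dag = FugueWorkflow()
a = dag.create(make_df, params=dict(p=2))
b = dag.create(make_df, params=dict(p=3))
dag.process(a, b, using=combine).show()
dag.run()  # no engine argument: uses the native (local) engine
```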
@@ -333,6 +347,9 @@ class _PandasParam(LocalDataFrameParam):

     @no_type_check
     def to_output_df(self, output: pd.DataFrame, schema: Any, ctx: Any) -> DataFrame:
+        _schema: Optional[Schema] = None if schema is None else Schema(schema)
+        if _schema is not None and _schema.names != list(output.columns):
+            output = output[_schema.names]
         return PandasDataFrame(output, schema)

     @no_type_check
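The effect of the guard: a pandas output whose columns are in a different order than the declared schema is realigned by name instead of being wrapped with mislabeled columns. The same pattern in isolation, sketched with triad's `Schema` (which fugue schemas build on):

```python
import pandas as pd
from triad import Schema

schema = Schema("a:int,b:int,c:int")
out = pd.DataFrame({"c": [2], "b": [4], "a": [3]})  # columns in the "wrong" order

# reorder by name when the order disagrees, as in the guard above
if schema.names != list(out.columns):
    out = out[schema.names]
print(list(out.columns))  # ['a', 'b', 'c']
```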
@@ -361,8 +378,15 @@ class _IterablePandasParam(LocalDataFrameParam):
         self, output: Iterable[pd.DataFrame], schema: Any, ctx: Any
     ) -> DataFrame:
         def dfs():
+            _schema: Optional[Schema] = None if schema is None else Schema(schema)
+            has_return = False
             for df in output:
-                yield PandasDataFrame(df, schema)
+                if _schema is not None and _schema.names != list(df.columns):
+                    df = df[_schema.names]
+                yield PandasDataFrame(df, _schema)
+                has_return = True
+            if not has_return and _schema is not None:
+                yield PandasDataFrame(schema=_schema)

        return IterablePandasDataFrame(dfs())

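The `has_return` flag is a sentinel for an empty generator: if the user's function yields no dataframes at all, the wrapper still emits one empty, correctly-typed dataframe so the declared schema is preserved downstream (the same pattern appears in `_IterableArrowParam` below). Reduced to its essentials it looks like this (a sketch; `make_empty` stands in for `PandasDataFrame(schema=_schema)`):

```python
from typing import Callable, Iterable, Iterator, TypeVar

T = TypeVar("T")

def ensure_nonempty(items: Iterable[T], make_empty: Callable[[], T]) -> Iterator[T]:
    """Yield items unchanged, but emit one placeholder if nothing came through."""
    has_return = False
    for item in items:
        yield item
        has_return = True
    if not has_return:
        yield make_empty()

print(list(ensure_nonempty([], lambda: "<empty df with schema>")))
```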
@@ -381,7 +405,12 @@ class _PyArrowTableParam(LocalDataFrameParam):

     def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
         assert isinstance(output, pa.Table)
-        return ArrowDataFrame(output, schema=schema)
+        adf: DataFrame = ArrowDataFrame(output)
+        if schema is not None:
+            _schema = Schema(schema)
+            if adf.schema != _schema:
+                adf = adf[_schema.names].alter_columns(_schema)
+        return adf

     def count(self, df: Any) -> int:  # pragma: no cover
         return df.count()
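For pyarrow outputs the declared schema is no longer applied blindly: the table is wrapped as-is and only reordered and cast when its actual schema disagrees with the declaration. In raw pyarrow terms, `adf[_schema.names].alter_columns(_schema)` corresponds roughly to a column `select` followed by a `cast`; a hedged sketch:

```python
import pyarrow as pa

target = pa.schema([("a", pa.int64()), ("b", pa.string())])
table = pa.table({"b": ["x"], "a": [1]})  # same columns, wrong order

# align only when needed, mirroring the check in to_output_df above
if table.schema != target:
    table = table.select(target.names).cast(target)
print(table.schema.names)  # ['a', 'b']
```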
@@ -409,13 +438,15 @@ class _IterableArrowParam(LocalDataFrameParam):
     ) -> DataFrame:
         def dfs():
             _schema: Optional[Schema] = None if schema is None else Schema(schema)
+            has_return = False
             for df in output:
-                adf = ArrowDataFrame(df)
-                if _schema is not None and not (  # pylint: disable-all
-                    adf.schema == schema
-                ):
+                adf: DataFrame = ArrowDataFrame(df)
+                if _schema is not None and adf.schema != _schema:
                     adf = adf[_schema.names].alter_columns(_schema)
                 yield adf
+                has_return = True
+            if not has_return and _schema is not None:
+                yield ArrowDataFrame(schema=_schema)

        return IterableArrowDataFrame(dfs())

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fugue
-Version: 0.8.6.dev1
+Version: 0.8.6.dev2
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team
@@ -43,14 +43,14 @@ Requires-Dist: pyarrow (>=6.0.1) ; extra == 'all'
 Requires-Dist: polars ; extra == 'all'
 Requires-Dist: dask[dataframe,distributed] ; (python_version < "3.8") and extra == 'all'
 Requires-Dist: ibis-framework (>=2.1.1) ; (python_version < "3.8") and extra == 'all'
-Requires-Dist: dask[dataframe,distributed] (>=2022.9.0) ; (python_version >= "3.8") and extra == 'all'
+Requires-Dist: dask[dataframe,distributed] (<2023.7.1,>=2022.9.0) ; (python_version >= "3.8") and extra == 'all'
 Requires-Dist: ibis-framework (<6,>=3.2.0) ; (python_version >= "3.8") and extra == 'all'
 Provides-Extra: cpp_sql_parser
 Requires-Dist: fugue-sql-antlr[cpp] (>=0.1.6) ; extra == 'cpp_sql_parser'
 Provides-Extra: dask
 Requires-Dist: qpd[dask] (>=0.4.4) ; extra == 'dask'
 Requires-Dist: dask[dataframe,distributed] ; (python_version < "3.8") and extra == 'dask'
-Requires-Dist: dask[dataframe,distributed] (>=2022.9.0) ; (python_version >= "3.8") and extra == 'dask'
+Requires-Dist: dask[dataframe,distributed] (<2023.7.1,>=2022.9.0) ; (python_version >= "3.8") and extra == 'dask'
 Provides-Extra: duckdb
 Requires-Dist: duckdb (>=0.5.0) ; extra == 'duckdb'
 Requires-Dist: pyarrow (>=6.0.1) ; extra == 'duckdb'
@@ -28,10 +28,10 @@ fugue/dataframe/__init__.py,sha256=zm7TbsaJLIvfm7zymWm2LGcuJd3nxfGsFnQiyrSnenM,6
 fugue/dataframe/api.py,sha256=c5Err3c-ayl-k28IUi6kV_ClDWX30NpVNkv97hQKDac,9862
 fugue/dataframe/array_dataframe.py,sha256=oBfN545NTGdYJ5zPIRv7hXRR-R_OW1JieyOfnl296oU,4447
 fugue/dataframe/arrow_dataframe.py,sha256=h0DJH8G0MEgfkyvX4U957iqDXIgvTtrP7YED5iEjizI,12098
-fugue/dataframe/dataframe.py,sha256=rIjaOplyRDt_BfpFwZWMjZALDqa03NePs9tFiLSW6Jg,17247
-fugue/dataframe/dataframe_iterable_dataframe.py,sha256=0gvb12D0s4VntNZ-M8J_Pic2XlFpw7upXf4hfMn2ufY,7255
+fugue/dataframe/dataframe.py,sha256=XOn9x_aKWOLdpach7RHxg2PoP-hpfsyjoRFvLx6xKV0,17496
+fugue/dataframe/dataframe_iterable_dataframe.py,sha256=lx71KfaI4lsVKI-79buc-idaeT20JEMBOq21SQcAiY8,7259
 fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs,4160
-fugue/dataframe/function_wrapper.py,sha256=DjyIrNBj5Bv9AwIl2I2fG5ClcKe3OoW_eBkyEabYY5Y,13505
+fugue/dataframe/function_wrapper.py,sha256=r6H1SQWaag2eSbJ50327t_bt7MZunbOMOl9OcOcQW2E,14827
 fugue/dataframe/iterable_dataframe.py,sha256=Kn5HZnVU4o1nn9mbbQxaV8rGG869wImZcOCK3AdlA-M,4627
 fugue/dataframe/pandas_dataframe.py,sha256=ZWqI-ZUFiSP7giJ3siRlrZcMedI_fyuoLn227H0YRvw,10453
 fugue/dataframe/utils.py,sha256=nQVU01jspB1NSeRiagE71uzRibDqvyGwi94ZfHwNHD0,10508
@@ -131,7 +131,7 @@ fugue_ray/_utils/io.py,sha256=gHfx70tdXPHmVL6nHxVhmCO5KpWjjyDG8qKT1Lbpav4,8737
 fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
 fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
 fugue_spark/dataframe.py,sha256=xoM2-SwVRFfSyfEEnx4g4b0GO6XgN-DQLvXtUrAMq1Q,9510
-fugue_spark/execution_engine.py,sha256=n_ZdQI4ysbgdDsjyWwbwhyBsgcg_icc9MAmVyWEn2RQ,32029
+fugue_spark/execution_engine.py,sha256=nssgfqt2h1OjGlK5iuGEyF-lt2pEokmsjrqo6K4C1Kg,32440
 fugue_spark/ibis_engine.py,sha256=Yl5xxwROo1idcD2hFaylaI1IpmBUgbvOZRWtcrE0Zjo,1697
 fugue_spark/registry.py,sha256=kyIMk6dAiKRSKCHawQKyXu9DhZ24T6j3gL57TiOAZ8c,4162
 fugue_spark/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -143,14 +143,14 @@ fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
 fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
 fugue_test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
-fugue_test/builtin_suite.py,sha256=piY_bVtLylQWrZbencrQwT89IXgvgBmNscddJ6vZieY,76338
+fugue_test/builtin_suite.py,sha256=sT_Btm7cpSsuLnt_PpNFeLZVc2WPmF5NQy4GKVri_-c,78396
 fugue_test/dataframe_suite.py,sha256=6uM7_et2Y55-ePIssG9G_K9mXBYNjPXnpunuCh0xKhw,19082
 fugue_test/execution_suite.py,sha256=HzM_7jUhnt1kD2jGaaPhyr-Q_vtRV4nCQTaLtMnzTuU,50948
 fugue_test/ibis_suite.py,sha256=Dk4AHVD00RcFsNm9VvJ4_4LOyFdGX30OnAtpO2SPruE,3529
 fugue_version/__init__.py,sha256=VpASnrti7EGWxUfSWGgERUfe7NLJltfVXYosOzHbpPg,22
-fugue-0.8.6.dev1.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
-fugue-0.8.6.dev1.dist-info/METADATA,sha256=OuDeHU_GR0fSI0jISu0fczkrLCmUgnmr8IYPcEjNbz8,18264
-fugue-0.8.6.dev1.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
-fugue-0.8.6.dev1.dist-info/entry_points.txt,sha256=xD0_Lj3jz-np_UNVAwsKt9kYWX0CwM7o7IvkCXRmcQY,374
-fugue-0.8.6.dev1.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
-fugue-0.8.6.dev1.dist-info/RECORD,,
+fugue-0.8.6.dev2.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+fugue-0.8.6.dev2.dist-info/METADATA,sha256=pGSp1pOq2BHvHd3KiRL3YzFHbNKzmKDMUPd8Hcs__xc,18284
+fugue-0.8.6.dev2.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
+fugue-0.8.6.dev2.dist-info/entry_points.txt,sha256=Ta1DD9RIgS_YfhieUvM6PgAzuOYmhlnKe9fWbns9sLc,374
+fugue-0.8.6.dev2.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
+fugue-0.8.6.dev2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.40.0)
+Generator: bdist_wheel (0.41.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

@@ -2,10 +2,10 @@
 dask = fugue_dask.registry [dask]
 dask_ibis = fugue_dask.ibis_engine [dask,ibis]
 duckdb = fugue_duckdb.registry [duckdb]
-duckdb_ibis = fugue_duckdb.ibis_engine [ibis,duckdb]
+duckdb_ibis = fugue_duckdb.ibis_engine [duckdb,ibis]
 ibis = fugue_ibis [ibis]
 polars = fugue_polars.registry [polars]
 ray = fugue_ray.registry [ray]
 spark = fugue_spark.registry [spark]
-spark_ibis = fugue_spark.ibis_engine [spark,ibis]
+spark_ibis = fugue_spark.ibis_engine [ibis,spark]

@@ -134,9 +134,8 @@ class SparkMapEngine(MapEngine):
     ) -> DataFrame:
         output_schema = Schema(output_schema)
         if self._should_use_pandas_udf(output_schema):
-            # pandas udf can only be used for pyspark > 3
             if len(partition_spec.partition_by) > 0:
-                if partition_spec.algo == "coarse":
+                if partition_spec.algo in ["coarse", "even"]:
                     return self._map_by_pandas_udf(
                         df,
                         map_func=map_func,
@@ -145,7 +144,18 @@ class SparkMapEngine(MapEngine):
                         on_init=on_init,
                         map_func_format_hint=map_func_format_hint,
                     )
-                elif partition_spec.algo != "even" or self.is_spark_connect:
+                else:
+                    if (  # not simple partitioning
+                        partition_spec.algo != "hash"
+                        or partition_spec.num_partitions != "0"
+                    ):
+                        # TODO: not sure if presort should be done
+                        # on physical partition level
+                        df = self.to_df(
+                            self.execution_engine.repartition(
+                                df, PartitionSpec(partition_spec, presort=[])
+                            )
+                        )
                     return self._group_map_by_pandas_udf(
                         df,
                         map_func=map_func,
@@ -154,7 +164,7 @@ class SparkMapEngine(MapEngine):
                         on_init=on_init,
                         map_func_format_hint=map_func_format_hint,
                     )
-            elif len(partition_spec.partition_by) == 0:
+            else:
                 return self._map_by_pandas_udf(
                     df,
                     map_func=map_func,
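Net effect of these three hunks: with pandas UDFs enabled, `coarse` and `even` partitioning both take the direct `_map_by_pandas_udf` path; any other partitioned spec goes through the grouped pandas-UDF path, preceded by an explicit repartition (with presort stripped) whenever the spec is not a plain default hash; unpartitioned maps fall through to the last branch. From the user side, the algo is just part of the partition spec; a hedged sketch of driving it through the transform API (assumes a configured Spark session; `add_size` is an illustrative transformer):

```python
import pandas as pd
import fugue.api as fa

# schema: *,n:long
def add_size(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(n=len(df))

df = pd.DataFrame({"a": [1, 1, 2], "b": [10, 20, 30]})

# "even" now joins "coarse" on the direct pandas-UDF path when partitioned
result = fa.transform(
    df,
    add_size,
    partition=dict(by=["a"], algo="even"),
    engine="spark",  # assumes an available Spark session
)
```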
@@ -22,6 +22,7 @@ from triad import SerializableRLock

 import fugue.api as fa
 from fugue import (
+    AnyDataFrame,
     ArrayDataFrame,
     CoTransformer,
     DataFrame,
@@ -365,6 +366,12 @@ class BuiltInTests(object):
             dag.output(dict(df=a), using=mock_outputter2)
             a.partition(num=3).output(MockOutputter3)
             dag.output(dict(aa=a, bb=b), using=MockOutputter4)
+
+            a = dag.create(mock_creator2, params=dict(p=2))
+            b = dag.create(mock_creator2, params=dict(p=2))
+            c = dag.process(a, b, using=mock_processor4)
+            c.assert_eq(ArrayDataFrame([[2]], "a:int"))
+            dag.output(a, b, using=mock_outputter4)
             dag.run(self.engine)

     def test_zip(self):
@@ -435,9 +442,14 @@
         # this test is important for using mapInPandas in spark

         # schema: *,c:int
-        def mt_pandas(dfs: Iterable[pd.DataFrame]) -> Iterator[pd.DataFrame]:
+        def mt_pandas(
+            dfs: Iterable[pd.DataFrame], empty: bool = False
+        ) -> Iterator[pd.DataFrame]:
             for df in dfs:
-                yield df.assign(c=2)
+                if not empty:
+                    df = df.assign(c=2)
+                    df = df[reversed(list(df.columns))]
+                    yield df

         with FugueWorkflow() as dag:
             a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
@@ -445,10 +457,25 @@
             dag.df([[1, 2, 2], [3, 4, 2]], "a:int,b:int,c:int").assert_eq(b)
             dag.run(self.engine)

+        # when iterable returns nothing
+        with FugueWorkflow() as dag:
+            a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
+            # without partitioning
+            b = a.transform(mt_pandas, params=dict(empty=True))
+            dag.df([], "a:int,b:int,c:int").assert_eq(b)
+            # with partitioning
+            b = a.partition_by("a").transform(mt_pandas, params=dict(empty=True))
+            dag.df([], "a:int,b:int,c:int").assert_eq(b)
+            dag.run(self.engine)
+
         # schema: *
-        def mt_arrow(dfs: Iterable[pa.Table]) -> Iterator[pa.Table]:
+        def mt_arrow(
+            dfs: Iterable[pa.Table], empty: bool = False
+        ) -> Iterator[pa.Table]:
             for df in dfs:
-                yield df
+                if not empty:
+                    df = df.select(reversed(df.schema.names))
+                    yield df

         # schema: a:long
         def mt_arrow_2(dfs: Iterable[pa.Table]) -> Iterator[pa.Table]:
@@ -463,6 +490,17 @@
             dag.df([[1], [3]], "a:long").assert_eq(b)
             dag.run(self.engine)

+        # when iterable returns nothing
+        with FugueWorkflow() as dag:
+            a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
+            # without partitioning
+            b = a.transform(mt_arrow, params=dict(empty=True))
+            dag.df([], "a:int,b:int").assert_eq(b)
+            # with partitioning
+            b = a.partition_by("a").transform(mt_arrow, params=dict(empty=True))
+            dag.df([], "a:int,b:int").assert_eq(b)
+            dag.run(self.engine)
+
     def test_transform_binary(self):
         with FugueWorkflow() as dag:
             a = dag.df([[1, pickle.dumps([0, "a"])]], "a:int,b:bytes")
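These new tests pin down the user-facing contract behind the `has_return` sentinels added in `function_wrapper.py`: an `Iterable` transformer that yields nothing must still produce an empty result with the declared schema, with or without partitioning. A condensed sketch of the same scenario outside the test suite (`maybe_empty` is an illustrative name; run on the default native engine):

```python
from typing import Iterable, Iterator

import pandas as pd
import fugue.api as fa

# schema: *,c:int
def maybe_empty(
    dfs: Iterable[pd.DataFrame], empty: bool = False
) -> Iterator[pd.DataFrame]:
    for df in dfs:
        if not empty:
            yield df.assign(c=2)

src = pd.DataFrame({"a": [1, 3], "b": [2, 4]})
out = fa.transform(src, maybe_empty, params=dict(empty=True))
print(out)  # expected: zero rows, but columns a, b, c all present
```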
@@ -1829,6 +1867,10 @@ def mock_creator(p: int) -> DataFrame:
     return ArrayDataFrame([[p]], "a:int")


+def mock_creator2(p: int) -> AnyDataFrame:
+    return fa.as_fugue_df([[p]], schema="a:int")
+
+
 def mock_processor(df1: List[List[Any]], df2: List[List[Any]]) -> DataFrame:
     return ArrayDataFrame([[len(df1) + len(df2)]], "a:int")

@@ -1844,6 +1886,10 @@ class MockProcessor3(Processor):
         return ArrayDataFrame([[sum(s.count() for s in dfs.values())]], "a:int")


+def mock_processor4(df1: AnyDataFrame, df2: AnyDataFrame) -> AnyDataFrame:
+    return ArrayDataFrame([[fa.count(df1) + fa.count(df2)]], "a:int")
+
+
 def mock_outputter(df1: List[List[Any]], df2: List[List[Any]]) -> None:
     assert len(df1) == len(df2)

@@ -1857,6 +1903,10 @@ class MockOutputter3(Outputter):
         assert "3" == self.partition_spec.num_partitions


+def mock_outputter4(df1: AnyDataFrame, df2: AnyDataFrame) -> None:
+    assert fa.count(df1) == fa.count(df2)
+
+
 class MockOutputter4(Outputter):
     def process(self, dfs):
         for k, v in dfs.items():
@@ -1895,8 +1945,8 @@ def mock_tf0(df: pd.DataFrame, p=1, col="p") -> pd.DataFrame:

 # schema: *,ct:int,p:int
 def mock_tf1(df: pd.DataFrame, p=1) -> pd.DataFrame:
-    df["ct"] = df.shape[0]
     df["p"] = p
+    df["ct"] = df.shape[0]
     return df
