touchtext 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- touchtext-0.0.1/PKG-INFO +16 -0
- touchtext-0.0.1/setup.cfg +4 -0
- touchtext-0.0.1/setup_touchtext.py +31 -0
- touchtext-0.0.1/touchtext/__init__.py +7 -0
- touchtext-0.0.1/touchtext/functional.py +138 -0
- touchtext-0.0.1/touchtext/wget.py +402 -0
- touchtext-0.0.1/touchtext.egg-info/PKG-INFO +16 -0
- touchtext-0.0.1/touchtext.egg-info/SOURCES.txt +9 -0
- touchtext-0.0.1/touchtext.egg-info/dependency_links.txt +1 -0
- touchtext-0.0.1/touchtext.egg-info/requires.txt +2 -0
- touchtext-0.0.1/touchtext.egg-info/top_level.txt +1 -0
touchtext-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: touchtext
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Markup Markdown, Stack up markdown files with `!INCLUDE` directives.
|
|
5
|
+
Home-page: https://github.com/hailiang-wang/transformer-pytorch-get-started/tree/master/src/touchtext
|
|
6
|
+
Author: Torchtext Team, Hai Liang W.
|
|
7
|
+
Author-email: hailiang.hl.wang@gmail.com
|
|
8
|
+
License: MIT License
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Topic :: Utilities
|
|
16
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
from setuptools import setup
|
|
5
|
+
|
|
6
|
+
# Package metadata for the `touchtext` distribution.
setup(
    name='touchtext',
    # NOTE(review): this summary describes Markdown `!INCLUDE` stacking, while
    # the distribution ships `functional.py` (tensor helpers) and `wget.py`
    # (download utility) -- it looks copied from another project; confirm.
    description='Markup Markdown, Stack up markdown files with `!INCLUDE` directives.',
    version='0.0.1',
    author='Torchtext Team, Hai Liang W.',
    author_email='hailiang.hl.wang@gmail.com',
    url='https://github.com/hailiang-wang/transformer-pytorch-get-started/tree/master/src/touchtext',
    classifiers=[
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        # NOTE(review): verify that the pinned `torch >= 2.3.1` requirement
        # below is installable on Python 3.7 before advertising it.
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.11',
        'Topic :: Utilities',
        # NOTE(review): "Production/Stable" on a 0.0.1 release -- confirm.
        'Development Status :: 5 - Production/Stable',
    ],
    license='MIT License',
    packages=['touchtext'],
    # No console scripts or plugins are registered.
    entry_points={
    },
    # Runtime dependencies.
    install_requires=[
        'torch >= 2.3.1',
        'tqdm',
    ],
)
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
from typing import Any, List, Optional
|
|
2
|
+
|
|
3
|
+
import torch
|
|
4
|
+
from torch import Tensor
|
|
5
|
+
from torch.nn.utils.rnn import pad_sequence
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"to_tensor",
|
|
9
|
+
"truncate",
|
|
10
|
+
"add_token",
|
|
11
|
+
"str_to_int",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def to_tensor(input: Any, padding_value: Optional[int] = None, dtype: torch.dtype = torch.long) -> Tensor:
    r"""Convert a sequence or a batch of token ids to a :class:`torch.Tensor`.

    :param input: Sequence or batch of token ids
    :type input: Union[List[int], List[List[int]]]
    :param padding_value: Pad value used to make every sequence in the batch as long as the
        longest sequence in the batch. When ``None`` the batch is handed to
        :func:`torch.tensor` unchanged, without any padding.
    :type padding_value: Optional[int]
    :param dtype: :class:`torch.dtype` of output tensor
    :type dtype: :class:`torch.dtype`
    :raises TypeError: if ``input`` is neither ``List[int]`` nor ``List[List[int]]``
    :rtype: Tensor
    """
    if torch.jit.isinstance(input, List[int]):
        # Honor the requested dtype: this branch used to hard-code torch.long,
        # silently ignoring the ``dtype`` argument for single sequences.
        return torch.tensor(input, dtype=dtype)
    elif torch.jit.isinstance(input, List[List[int]]):
        if padding_value is None:
            output = torch.tensor(input, dtype=dtype)
            return output
        else:
            # pad_sequence right-pads every row up to the longest one
            output = pad_sequence(
                [torch.tensor(ids, dtype=dtype) for ids in input], batch_first=True, padding_value=float(padding_value)
            )
            return output
    else:
        raise TypeError("Input type not supported")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def truncate(input: Any, max_seq_len: int) -> Any:
    """Cut a sequence, or every sequence of a batch, down to ``max_seq_len`` items.

    :param input: Input sequence or batch to be truncated
    :type input: Union[List[Union[str, int]], List[List[Union[str, int]]]]
    :param max_seq_len: Maximum length beyond which input is discarded
    :type max_seq_len: int
    :raises TypeError: if ``input`` is none of the supported list types
    :return: Truncated sequence
    :rtype: Union[List[Union[str, int]], List[List[Union[str, int]]]]
    """
    if torch.jit.isinstance(input, List[int]):
        return input[:max_seq_len]
    elif torch.jit.isinstance(input, List[str]):
        return input[:max_seq_len]
    elif torch.jit.isinstance(input, List[List[int]]):
        # batch of id sequences: slice each row independently
        id_rows: List[List[int]] = [row[:max_seq_len] for row in input]
        return id_rows
    elif torch.jit.isinstance(input, List[List[str]]):
        # batch of token sequences: slice each row independently
        token_rows: List[List[str]] = [row[:max_seq_len] for row in input]
        return token_rows
    else:
        raise TypeError("Input type not supported")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def add_token(input: Any, token_id: Any, begin: bool = True) -> Any:
    """Prepend or append a token to a sequence, or to every sequence of a batch.

    The input list is not modified; new lists are returned.

    :param input: Input sequence or batch
    :type input: Union[List[Union[str, int]], List[List[Union[str, int]]]]
    :param token_id: token to be added
    :type token_id: Union[str, int]
    :param begin: Whether to insert token at start or end of sequence, defaults to True
    :type begin: bool, optional
    :raises TypeError: if the input / token type combination is not supported
    :return: sequence or batch with token_id added to begin or end of input
    :rtype: Union[List[Union[str, int]], List[List[Union[str, int]]]]
    """
    if torch.jit.isinstance(input, List[int]) and torch.jit.isinstance(token_id, int):
        return [token_id] + input if begin else input + [token_id]
    elif torch.jit.isinstance(input, List[str]) and torch.jit.isinstance(token_id, str):
        return [token_id] + input if begin else input + [token_id]
    elif torch.jit.isinstance(input, List[List[int]]) and torch.jit.isinstance(token_id, int):
        if begin:
            id_rows: List[List[int]] = [[token_id] + row for row in input]
        else:
            id_rows = [row + [token_id] for row in input]
        return id_rows
    elif torch.jit.isinstance(input, List[List[str]]) and torch.jit.isinstance(token_id, str):
        if begin:
            token_rows: List[List[str]] = [[token_id] + row for row in input]
        else:
            token_rows = [row + [token_id] for row in input]
        return token_rows
    else:
        raise TypeError("Input type not supported")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def str_to_int(input: Any) -> Any:
    """Parse string tokens into integers, for one sequence or for a batch.

    :param input: Input sequence or batch
    :type input: Union[List[str], List[List[str]]]
    :raises TypeError: if ``input`` is neither ``List[str]`` nor ``List[List[str]]``
    :return: Sequence or batch of string tokens converted to integers
    :rtype: Union[List[int], List[List[int]]]
    """
    if torch.jit.isinstance(input, List[str]):
        converted: List[int] = [int(token) for token in input]
        return converted
    if torch.jit.isinstance(input, List[List[str]]):
        converted_rows: List[List[int]] = [[int(token) for token in row] for row in input]
        return converted_rows
    raise TypeError("Input type not supported")
|
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Download utility as an easy way to get file from the net
|
|
4
|
+
|
|
5
|
+
python -m wget <URL>
|
|
6
|
+
python wget.py <URL>
|
|
7
|
+
|
|
8
|
+
Downloads: http://pypi.python.org/pypi/wget/
|
|
9
|
+
Development: http://bitbucket.org/techtonik/python-wget/
|
|
10
|
+
|
|
11
|
+
wget.py is not option compatible with Unix wget utility,
|
|
12
|
+
to make command line interface intuitive for new people.
|
|
13
|
+
|
|
14
|
+
Public domain by anatoly techtonik <techtonik@gmail.com>
|
|
15
|
+
Also available under the terms of MIT license
|
|
16
|
+
Copyright (c) 2010-2014 anatoly techtonik
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
import sys, shutil, os
|
|
21
|
+
import tempfile
|
|
22
|
+
import math
|
|
23
|
+
|
|
24
|
+
PY3K = sys.version_info >= (3, 0)
|
|
25
|
+
if PY3K:
|
|
26
|
+
import urllib.request as urllib
|
|
27
|
+
import urllib.parse as urlparse
|
|
28
|
+
else:
|
|
29
|
+
import urllib
|
|
30
|
+
import urlparse
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
__version__ = "2.3-beta1"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def filename_from_url(url):
    """Take the last component of the URL path as a file name.

    :param url: URL string to inspect
    :return: detected filename or None
    """
    url_path = urlparse.urlparse(url).path
    candidate = os.path.basename(url_path)
    # a name made only of whitespace and dots is not usable
    if not candidate.strip(" \n\t."):
        return None
    return candidate
|
|
42
|
+
|
|
43
|
+
def filename_from_headers(headers):
    """Detect filename from Content-Disposition headers if present.
    http://greenbytes.de/tech/tc2231/

    :param headers: headers as an object with a ``get`` method (e.g. dict),
                    a list of ``Name: value`` lines, or a single string
                    holding such lines
    :return: filename from content-disposition header or None
    """
    if isinstance(headers, str):
        headers = headers.splitlines()
    if isinstance(headers, list):
        # lines without a colon (blank lines, a status line) are not headers
        # and would otherwise make dict() fail
        headers = dict(line.split(':', 1) for line in headers if ':' in line)
    cdisp = headers.get("Content-Disposition")
    if not cdisp:
        return None
    cdtype = cdisp.split(';')
    if len(cdtype) == 1:
        # disposition type only, no parameters
        return None
    if cdtype[0].strip().lower() not in ('inline', 'attachment'):
        return None
    # exactly one filename parameter is accepted: several are illegal, and
    # with none there is nothing to detect (this used to raise IndexError)
    fnames = [x for x in cdtype[1:] if x.strip().startswith('filename=')]
    if len(fnames) != 1:
        return None
    # split once so that '=' characters inside the file name are preserved
    name = fnames[0].split('=', 1)[1].strip(' \t"')
    # drop any directory part supplied by the server
    name = os.path.basename(name)
    if not name:
        return None
    return name
|
|
71
|
+
|
|
72
|
+
def filename_fix_existing(filename):
    """Expands name portion of filename with numeric ' (x)' suffix to
    return filename that doesn't exist already.

    The directory part of `filename` (current directory if there is none) is
    scanned for entries named like ``name (x).ext``; the returned name uses
    the highest ``x`` found plus one, or 1 when there is none.

    :param filename: path of the file that already exists
    :return: path in the same directory with a ' (x)' suffix added to the name
    """
    # look into the directory the file lives in, not always into '.'
    dirname, basename = os.path.split(filename)
    # splitext copes with names that have no extension at all
    name, ext = os.path.splitext(basename)

    indexes = []
    for entry in os.listdir(dirname or '.'):
        if not entry.startswith(name):
            continue
        # what follows `name` once the entry's own extension is dropped
        suffix = os.path.splitext(entry)[0][len(name):]
        # keep only suffixes that match the ' (x)' pattern
        if not (suffix.startswith(' (') and suffix.endswith(')')):
            continue
        digits = suffix[2:-1]
        # non-empty check: ' ()' must not reach int('')
        if digits and set(digits) <= set('0123456789'):
            indexes.append(int(digits))

    idx = 1
    if indexes:
        idx += max(indexes)
    return os.path.join(dirname, '%s (%d)%s' % (name, idx, ext))
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# --- terminal/console output helpers ---
|
|
93
|
+
|
|
94
|
+
def get_console_width():
    """Return width of available window area. Autodetection works for
    Windows and POSIX platforms. Returns 80 for others

    On Windows 0 is returned when GetConsoleScreenBufferInfo reports
    failure. On POSIX the value stays 0 when the ioctl on stdout raises
    IOError, because the zero-initialised buffer is returned unchanged.

    Code from http://bitbucket.org/techtonik/python-pager

    :return: number of columns as int
    """

    if os.name == 'nt':
        # standard handle ids; only STD_OUTPUT_HANDLE is used below
        STD_INPUT_HANDLE = -10
        STD_OUTPUT_HANDLE = -11
        STD_ERROR_HANDLE = -12

        # get console handle
        from ctypes import windll, Structure, byref
        try:
            from ctypes.wintypes import SHORT, WORD, DWORD
        except ImportError:
            # workaround for missing types in Python 2.5
            from ctypes import (
                c_short as SHORT, c_ushort as WORD, c_ulong as DWORD)
        console_handle = windll.kernel32.GetStdHandle(STD_OUTPUT_HANDLE)

        # CONSOLE_SCREEN_BUFFER_INFO Structure
        class COORD(Structure):
            _fields_ = [("X", SHORT), ("Y", SHORT)]

        class SMALL_RECT(Structure):
            _fields_ = [("Left", SHORT), ("Top", SHORT),
                        ("Right", SHORT), ("Bottom", SHORT)]

        class CONSOLE_SCREEN_BUFFER_INFO(Structure):
            _fields_ = [("dwSize", COORD),
                        ("dwCursorPosition", COORD),
                        ("wAttributes", WORD),
                        ("srWindow", SMALL_RECT),
                        ("dwMaximumWindowSize", DWORD)]

        sbi = CONSOLE_SCREEN_BUFFER_INFO()
        ret = windll.kernel32.GetConsoleScreenBufferInfo(console_handle, byref(sbi))
        # a zero return value is treated as "no console information available"
        if ret == 0:
            return 0
        # Right is the index of the last visible column, hence +1
        return sbi.srWindow.Right+1

    elif os.name == 'posix':
        from fcntl import ioctl
        from termios import TIOCGWINSZ
        from array import array

        # buffer of four unsigned shorts filled in by the ioctl call
        # (presumably rows, columns, x pixels, y pixels -- see TIOCGWINSZ docs)
        winsize = array("H", [0] * 4)
        try:
            ioctl(sys.stdout.fileno(), TIOCGWINSZ, winsize)
        except IOError:
            # best effort: keep the zeroed buffer
            pass
        # the tuple is (winsize[1], winsize[0]); element 0 is winsize[1]
        return (winsize[1], winsize[0])[0]

    return 80
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def bar_thermometer(current, total, width=80):
    r"""Return thermometer style progress bar string. `total` argument
    can not be zero. The minimum size of bar returned is 3. Example:

        [..........            ]

    Control and trailing symbols (\r and spaces) are not included.
    See `bar_adaptive` for more information.

    :param current: amount done so far
    :param total: amount that corresponds to a full bar, must not be zero
    :param width: total length of the returned string, brackets included
    :return: string of `width` characters (for `width` >= 2)
    """
    # number of dots on thermometer scale
    avail_dots = width - 2
    ratio = float(current) / total
    # clamp so that an overshooting (current > total) or negative counter
    # can not make the bar longer or shorter than `width`
    ratio = min(max(ratio, 0.0), 1.0)
    shaded_dots = int(math.floor(ratio * avail_dots))
    return '[' + '.' * shaded_dots + ' ' * (avail_dots - shaded_dots) + ']'
|
|
165
|
+
|
|
166
|
+
def bar_adaptive(current, total, width=80):
    r"""Return progress bar string for given values in one of three
    styles depending on available width:

        [..  ] downloaded / total
        downloaded / total
        [.. ]

    if total value is unknown or <= 0, show bytes counter using two
    adaptive styles:

        %s / unknown
        %s

    if there is not enough space on the screen, do not display anything

    returned string doesn't include control characters like \r used to
    place cursor at the beginning of the line to erase previous content.

    this function leaves one free character at the end of string to
    avoid automatic linefeed on Windows.

    :param current: amount done so far
    :param total: total amount; None, 0 or a negative value mean "unknown"
    :param width: number of columns available for the output
    :return: progress string, possibly empty
    """

    # process special case when total size is unknown and return immediately
    if not total or total < 0:
        msg = "%s / unknown" % current
        if len(msg) < width:    # leaves one character to avoid linefeed
            return msg
        if len("%s" % current) < width:
            return "%s" % current
        # not even the bare counter fits: display nothing instead of falling
        # through to the layout below, which divides by `total`
        return ''

    # --- adaptive layout algorithm ---
    #
    # [x] describe the format of the progress bar
    # [x] describe min width for each data field
    # [x] set priorities for each element
    # [x] select elements to be shown
    #   [x] choose top priority element min_width < avail_width
    #   [x] lessen avail_width by value if min_width
    #   [x] exclude element from priority list and repeat

    #  10% [.. ]  10/100
    # pppp bbbbb sssssss

    min_width = {
        'percent': 4,  # 100%
        'bar': 3,      # [.]
        'size': len("%s" % total)*2 + 3,  # 'xxxx / yyyy'
    }
    priority = ['percent', 'bar', 'size']

    # select elements to show
    selected = []
    avail = width
    for field in priority:
        if min_width[field] < avail:
            selected.append(field)
            # +1 is for separator or for reserved space at
            # the end of line to avoid linefeed on Windows
            avail -= min_width[field] + 1

    # render every selected field, then glue them with the field separator
    rendered = []
    for field in selected:
        if field == 'percent':
            # fixed size width for percentage
            rendered.append(('%s%%' % (100 * current // total)).rjust(min_width['percent']))
        elif field == 'bar':  # [. ]
            # bar takes its min width + all available space
            rendered.append(bar_thermometer(current, total, min_width['bar'] + avail))
        elif field == 'size':
            # size field has a constant width (min == max)
            rendered.append(("%s / %s" % (current, total)).rjust(min_width['size']))

    return ' '.join(rendered)
|
|
244
|
+
|
|
245
|
+
# --/ console helpers
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
__current_size = 0  # global state variable, which exists solely as a
                    # workaround against Python 3.3.0 regression
                    # http://bugs.python.org/issue16409
                    # fixed in Python 3.3.1
def callback_progress(blocks, block_size, total_size, bar_function):
    """callback function for urlretrieve that is called when connection is
    created and then once for each block

    draws adaptive progress bar in terminal/console

    use sys.stdout.write() instead of "print,", because it allows one more
    symbol at the line end without linefeed on Windows

    :param blocks: number of blocks transferred so far
    :param block_size: in bytes
    :param total_size: in bytes, can be -1 if server doesn't return it
    :param bar_function: another callback function to visualize progress,
                         called as bar_function(current_size, total_size, width)
    """
    global __current_size

    # never draw wider than 100 columns
    width = min(100, get_console_width())

    if sys.version_info[:3] == (3, 3, 0):  # regression workaround
        if blocks == 0:  # first call
            __current_size = 0
        else:
            __current_size += block_size
        current_size = __current_size
    else:
        current_size = blocks * block_size
        # the last block is usually partial, so cap the counter at the total;
        # skip the cap when the total is unknown (-1), otherwise the counter
        # itself would be stuck at -1
        if total_size and total_size > 0:
            current_size = min(current_size, total_size)
    progress = bar_function(current_size, total_size, width)
    if progress:
        # \r moves the cursor back so that the previous bar is overwritten
        sys.stdout.write("\r" + progress)
|
|
281
|
+
|
|
282
|
+
class ThrowOnErrorOpener(urllib.FancyURLopener):
    """URL opener whose default HTTP error handler raises an exception.

    NOTE(review): FancyURLopener belongs to the legacy urllib interface that
    has long been deprecated -- confirm it is still available on every
    Python version this package targets.
    """
    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # report the failure as a generic Exception with "<errcode>: <errmsg>"
        raise Exception("%s: %s" % (errcode, errmsg))
|
|
285
|
+
|
|
286
|
+
def download(url, out=None, bar=bar_adaptive):
    """High level function, which downloads URL into tmp file in current
    directory and then renames it to filename autodetected from either URL
    or HTTP headers.

    The target name is picked in this order: `out` (when it is not a
    directory), name from the Content-Disposition header, name from the URL,
    and finally a fixed fallback name. If the target already exists, a
    numeric ' (x)' suffix is added instead of overwriting it.

    :param url: address to download from
    :param bar: function to track download progress (visualize etc.),
                a false value disables progress output
    :param out: output filename or directory
    :return:    filename where URL is downloaded to
    :raises Exception: raised by ThrowOnErrorOpener on HTTP errors; the
                temporary file is removed before the error propagates
    """
    # used when neither `out`, the headers nor the URL provide a name
    default_name = "download.wget"

    names = dict()
    names["out"] = out or ''
    names["url"] = filename_from_url(url)
    # get filename for temp file in current directory
    prefix = (names["url"] or names["out"] or ".") + "."
    (fd, tmpfile) = tempfile.mkstemp(".tmp", prefix=prefix, dir=".")
    # only the unique name is needed; the opener creates the file itself
    os.close(fd)
    os.unlink(tmpfile)

    # set progress monitoring callback
    def callback_charged(blocks, block_size, total_size):
        # 'closure' to set bar drawing function in callback
        callback_progress(blocks, block_size, total_size, bar_function=bar)
    callback = callback_charged if bar else None

    try:
        (tmpfile, headers) = ThrowOnErrorOpener().retrieve(url, tmpfile, callback)
    except BaseException:
        # do not leave a partial temp file behind on errors or Ctrl-C
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
        raise

    names["header"] = filename_from_headers(headers)
    if os.path.isdir(names["out"]):
        filename = names["header"] or names["url"] or default_name
        filename = names["out"] + "/" + filename
    else:
        filename = names["out"] or names["header"] or names["url"] or default_name
    # add numeric ' (x)' suffix if filename already exists
    if os.path.exists(filename):
        filename = filename_fix_existing(filename)
    shutil.move(tmpfile, filename)

    return filename
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
usage = """\
|
|
330
|
+
usage: wget.py [options] URL
|
|
331
|
+
|
|
332
|
+
options:
|
|
333
|
+
-o --output FILE|DIR output filename or directory
|
|
334
|
+
-h --help
|
|
335
|
+
--version
|
|
336
|
+
"""
|
|
337
|
+
|
|
338
|
+
# Command line entry point: wget.py [options] URL
if __name__ == "__main__":
    if len(sys.argv) < 2 or "-h" in sys.argv or "--help" in sys.argv:
        sys.exit(usage)
    if "--version" in sys.argv:
        sys.exit("wget.py " + __version__)

    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("-o", "--output", dest="output")
    (options, args) = parser.parse_args()

    # only options were given (e.g. `-o FILE` without a URL): show usage
    # instead of failing with IndexError on args[0]
    if not args:
        sys.exit(usage)

    filename = download(args[0], out=options.output)

    # the progress bar leaves the cursor on its own line, so start a new one
    print("")
    print("Saved under %s" % filename)
|
|
354
|
+
|
|
355
|
+
r"""
|
|
356
|
+
features that require more tuits for urlretrieve API
|
|
357
|
+
http://www.python.org/doc/2.6/library/urllib.html#urllib.urlretrieve
|
|
358
|
+
|
|
359
|
+
[x] autodetect filename from URL
|
|
360
|
+
[x] autodetect filename from headers - Content-Disposition
|
|
361
|
+
http://greenbytes.de/tech/tc2231/
|
|
362
|
+
[ ] make HEAD request to detect temp filename from Content-Disposition
|
|
363
|
+
[ ] process HTTP status codes (i.e. 404 error)
|
|
364
|
+
http://ftp.de.debian.org/debian/pool/iso-codes_3.24.2.orig.tar.bz2
|
|
365
|
+
[ ] catch KeyboardInterrupt
|
|
366
|
+
[ ] optionally preserve incomplete file
|
|
367
|
+
[x] create temp file in current directory
|
|
368
|
+
[ ] resume download (broken connection)
|
|
369
|
+
[ ] resume download (incomplete file)
|
|
370
|
+
[x] show progress indicator
|
|
371
|
+
http://mail.python.org/pipermail/tutor/2005-May/038797.html
|
|
372
|
+
[x] do not overwrite downloaded file
|
|
373
|
+
[x] rename file automatically if exists
|
|
374
|
+
[x] optionally specify path for downloaded file
|
|
375
|
+
|
|
376
|
+
[ ] options plan
|
|
377
|
+
[x] -h, --help, --version (CHAOS speccy)
|
|
378
|
+
[ ] clpbar progress bar style
|
|
379
|
+
_ 30.0Mb at 3.0 Mbps eta: 0:00:20 30% [===== ]
|
|
380
|
+
[ ] test "bar \r" print with \r at the end of line on Windows
|
|
381
|
+
[ ] process Python 2.x urllib.ContentTooShortError exception gracefully
|
|
382
|
+
(ideally retry and continue download)
|
|
383
|
+
|
|
384
|
+
(tmpfile, headers) = urllib.urlretrieve(url, tmpfile, callback_progress)
|
|
385
|
+
File "C:\Python27\lib\urllib.py", line 93, in urlretrieve
|
|
386
|
+
return _urlopener.retrieve(url, filename, reporthook, data)
|
|
387
|
+
File "C:\Python27\lib\urllib.py", line 283, in retrieve
|
|
388
|
+
"of %i bytes" % (read, size), result)
|
|
389
|
+
urllib.ContentTooShortError: retrieval incomplete: got only 15239952 out of 24807571 bytes
|
|
390
|
+
|
|
391
|
+
[ ] find out if urlretrieve may return unicode headers
|
|
392
|
+
[ ] test suite for unsafe filenames from url and from headers
|
|
393
|
+
|
|
394
|
+
[ ] security checks
|
|
395
|
+
[ ] filename_from_url
|
|
396
|
+
[ ] filename_from_headers
|
|
397
|
+
[ ] MITM redirect from https URL
|
|
398
|
+
[ ] https certificate check
|
|
399
|
+
[ ] size+hash check helpers
|
|
400
|
+
[ ] fail if size is known and mismatch
|
|
401
|
+
[ ] fail if hash mismatch
|
|
402
|
+
"""
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: touchtext
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Markup Markdown, Stack up markdown files with `!INCLUDE` directives.
|
|
5
|
+
Home-page: https://github.com/hailiang-wang/transformer-pytorch-get-started/tree/master/src/touchtext
|
|
6
|
+
Author: Torchtext Team, Hai Liang W.
|
|
7
|
+
Author-email: hailiang.hl.wang@gmail.com
|
|
8
|
+
License: MIT License
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Topic :: Utilities
|
|
16
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
touchtext
|