UrlCategorise 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,297 @@
1
+ # Video URL Detection Patterns
2
+ # Generated on: 2025-08-27 14:46:56 UTC
3
+ # Source: yt-dlp extractors (https://github.com/yt-dlp/yt-dlp) + manual patterns
4
+ # Purpose: Regex patterns to detect video URLs vs other content
5
+ #
6
+ # These patterns help distinguish between:
7
+ # - Direct video content URLs
8
+ # - Homepage, playlist, user profile, community content URLs
9
+ #
10
+ # Usage: Use these patterns to categorize URLs from video hosting domains
11
+ # to determine if they contain actual video content or other resources
12
+ #
13
+
14
+ # ===== MANUAL HIGH-PRIORITY PATTERNS =====
15
+
16
+ # Source: manual_youtube
17
+ # Description: YouTube video watch URLs
18
+ # Pattern: https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)[a-zA-Z0-9_-]{11}
19
+ https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)[a-zA-Z0-9_-]{11}
20
+
21
+ # Source: manual_youtube_shorts
22
+ # Description: YouTube Shorts URLs
23
+ # Pattern: https?://(?:www\.)?youtube\.com/shorts/[a-zA-Z0-9_-]{11}
24
+ https?://(?:www\.)?youtube\.com/shorts/[a-zA-Z0-9_-]{11}
25
+
26
+ # Source: manual_vimeo
27
+ # Description: Vimeo video URLs
28
+ # Pattern: https?://(?:www\.)?vimeo\.com/\d+
29
+ https?://(?:www\.)?vimeo\.com/\d+
30
+
31
+ # Source: manual_dailymotion
32
+ # Description: Dailymotion video URLs
33
+ # Pattern: https?://(?:www\.)?dailymotion\.com/video/[a-zA-Z0-9]+
34
+ https?://(?:www\.)?dailymotion\.com/video/[a-zA-Z0-9]+
35
+
36
+ # Source: manual_twitch_videos
37
+ # Description: Twitch video URLs
38
+ # Pattern: https?://(?:www\.)?twitch\.tv/videos/\d+
39
+ https?://(?:www\.)?twitch\.tv/videos/\d+
40
+
41
+ # Source: manual_tiktok
42
+ # Description: TikTok video URLs
43
+ # Pattern: https?://(?:www\.)?tiktok\.com/@[^/]+/video/\d+
44
+ https?://(?:www\.)?tiktok\.com/@[^/]+/video/\d+
45
+
46
+ # ===== EXTRACTED PATTERNS FROM YT-DLP =====
47
+
48
+ # Source: arte
49
+ # Pattern: https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+
50
+ # Original: https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+
51
+ https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+
52
+
53
+ # Source: bilibili
54
+ # Pattern: https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)
55
+ # Original: https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)
56
+ https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)
57
+
58
+ # Source: bilibili
59
+ # Pattern: https?://(?:www\.)?bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+
60
+ # Original: https?://(?:www\.)?bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+
61
+ https?://(?:www\.)?bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+
62
+
63
+ # Source: cbsnews
64
+ # Pattern: https?://(?:www\.)?cbsnews\.com/live/?(?:[?#]|$)
65
+ # Original: https?://(?:www\.)?cbsnews\.com/live/?(?:[?#]|$)
66
+ https?://(?:www\.)?cbsnews\.com/live/?(?:[?#]|$)
67
+
68
+ # Source: ciscolive
69
+ # Pattern: https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)
70
+ # Original: https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)
71
+ https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)
72
+
73
+ # Source: commonprotocols
74
+ # Pattern: (?i)rtmp[est]?://.+
75
+ # Original: (?i)rtmp[est]?://.+
76
+ (?i)rtmp[est]?://.+
77
+
78
+ # Source: commonprotocols
79
+ # Pattern: (?i)mms://.+
80
+ # Original: (?i)mms://.+
81
+ (?i)mms://.+
82
+
83
+ # Source: cspan
84
+ # Pattern: https?://(?:www\.)?c-span\.org/congress/
85
+ # Original: https?://(?:www\.)?c-span\.org/congress/
86
+ https?://(?:www\.)?c-span\.org/congress/
87
+
88
+ # Source: dplay
89
+ # Pattern: https?://de\.hgtv\.com/sendungen
90
+ # Original: https?://de\.hgtv\.com/sendungen
91
+ https?://de\.hgtv\.com/sendungen
92
+
93
+ # Source: dplay
94
+ # Pattern: https?://(?:go\.)?discovery\.com/video
95
+ # Original: https?://(?:go\.)?discovery\.com/video
96
+ https?://(?:go\.)?discovery\.com/video
97
+
98
+ # Source: dplay
99
+ # Pattern: https?://(?:watch\.)?travelchannel\.com/video
100
+ # Original: https?://(?:watch\.)?travelchannel\.com/video
101
+ https?://(?:watch\.)?travelchannel\.com/video
102
+
103
+ # Source: dplay
104
+ # Pattern: https?://(?:watch\.)?cookingchanneltv\.com/video
105
+ # Original: https?://(?:watch\.)?cookingchanneltv\.com/video
106
+ https?://(?:watch\.)?cookingchanneltv\.com/video
107
+
108
+ # Source: dplay
109
+ # Pattern: https?://(?:watch\.)?hgtv\.com/video
110
+ # Original: https?://(?:watch\.)?hgtv\.com/video
111
+ https?://(?:watch\.)?hgtv\.com/video
112
+
113
+ # Source: dplay
114
+ # Pattern: https?://(?:watch\.)?foodnetwork\.com/video
115
+ # Original: https?://(?:watch\.)?foodnetwork\.com/video
116
+ https?://(?:watch\.)?foodnetwork\.com/video
117
+
118
+ # Source: dplay
119
+ # Pattern: https?://(?:www\.)?destinationamerica\.com/video
120
+ # Original: https?://(?:www\.)?destinationamerica\.com/video
121
+ https?://(?:www\.)?destinationamerica\.com/video
122
+
123
+ # Source: dplay
124
+ # Pattern: https?://(?:www\.)?investigationdiscovery\.com/video
125
+ # Original: https?://(?:www\.)?investigationdiscovery\.com/video
126
+ https?://(?:www\.)?investigationdiscovery\.com/video
127
+
128
+ # Source: dplay
129
+ # Pattern: https?://(?:www\.)?ahctv\.com/video
130
+ # Original: https?://(?:www\.)?ahctv\.com/video
131
+ https?://(?:www\.)?ahctv\.com/video
132
+
133
+ # Source: dplay
134
+ # Pattern: https?://(?:www\.)?sciencechannel\.com/video
135
+ # Original: https?://(?:www\.)?sciencechannel\.com/video
136
+ https?://(?:www\.)?sciencechannel\.com/video
137
+
138
+ # Source: dplay
139
+ # Pattern: https?://(?:www\.)?discoverylife\.com/video
140
+ # Original: https?://(?:www\.)?discoverylife\.com/video
141
+ https?://(?:www\.)?discoverylife\.com/video
142
+
143
+ # Source: dplay
144
+ # Pattern: https?://(?:www\.)?animalplanet\.com/video
145
+ # Original: https?://(?:www\.)?animalplanet\.com/video
146
+ https?://(?:www\.)?animalplanet\.com/video
147
+
148
+ # Source: dplay
149
+ # Pattern: https?://(?:go\.)?tlc\.com/video
150
+ # Original: https?://(?:go\.)?tlc\.com/video
151
+ https?://(?:go\.)?tlc\.com/video
152
+
153
+ # Source: dplay
154
+ # Pattern: https?://(?:www\.)?discoveryplus\.in/videos?
155
+ # Original: https?://(?:www\.)?discoveryplus\.in/videos?
156
+ https?://(?:www\.)?discoveryplus\.in/videos?
157
+
158
+ # Source: dplay
159
+ # Pattern: https?://(?:www\.)?discoveryplus\.com/it/video(?:/sport|/olympics)?
160
+ # Original: https?://(?:www\.)?discoveryplus\.com/it/video(?:/sport|/olympics)?
161
+ https?://(?:www\.)?discoveryplus\.com/it/video(?:/sport|/olympics)?
162
+
163
+ # Source: embedly
164
+ # Pattern: https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?(?:src|url)=(?:[^#&]+)
165
+ # Original: https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?(?:src|url)=(?:[^#&]+)
166
+ https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?(?:src|url)=(?:[^#&]+)
167
+
168
+ # Source: facebook
169
+ # Pattern: https?://(?:[\w-]+\.)?facebook\.com/flx/warn[/?]
170
+ # Original: https?://(?:[\w-]+\.)?facebook\.com/flx/warn[/?]
171
+ https?://(?:[\w-]+\.)?facebook\.com/flx/warn[/?]
172
+
173
+ # Source: getcourseru
174
+ # Pattern: https?://(?:player02\.getcourse\.ru|cf-api-2\.vhcdn\.com)/sign-player/?\?(?:[^#]+&)?json=[^#&]+
175
+ # Original: https?://(?:player02\.getcourse\.ru|cf-api-2\.vhcdn\.com)/sign-player/?\?(?:[^#]+&)?json=[^#&]+
176
+ https?://(?:player02\.getcourse\.ru|cf-api-2\.vhcdn\.com)/sign-player/?\?(?:[^#]+&)?json=[^#&]+
177
+
178
+ # Source: gronkh
179
+ # Pattern: https?://(?:www\.)?gronkh\.tv(?:/feed)?/?(?:#|$)
180
+ # Original: https?://(?:www\.)?gronkh\.tv(?:/feed)?/?(?:#|$)
181
+ https?://(?:www\.)?gronkh\.tv(?:/feed)?/?(?:#|$)
182
+
183
+ # Source: gronkh
184
+ # Pattern: https?://(?:www\.)?gronkh\.tv/vods/streams/?(?:#|$)
185
+ # Original: https?://(?:www\.)?gronkh\.tv/vods/streams/?(?:#|$)
186
+ https?://(?:www\.)?gronkh\.tv/vods/streams/?(?:#|$)
187
+
188
+ # Source: internetvideoarchive
189
+ # Pattern: https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?
190
+ # Original: https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?
191
+ https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?
192
+
193
+ # Source: iqiyi
194
+ # Pattern: https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html
195
+ # Original: https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html
196
+ https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html
197
+
198
+ # Source: japandiet
199
+ # Pattern: https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php
200
+ # Original: https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php
201
+ https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php
202
+
203
+ # Source: jeuxvideo
204
+ # Pattern: https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm
205
+ # Original: https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm
206
+ https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm
207
+
208
+ # Source: leeco
209
+ # Pattern: https?://yuntv\.letv\.com/bcloud.html\?.+
210
+ # Original: https?://yuntv\.letv\.com/bcloud.html\?.+
211
+ https?://yuntv\.letv\.com/bcloud.html\?.+
212
+
213
+ # Source: manoto
214
+ # Pattern: https?://(?:www\.)?manototv\.com/live/
215
+ # Original: https?://(?:www\.)?manototv\.com/live/
216
+ https?://(?:www\.)?manototv\.com/live/
217
+
218
+ # Source: mediaite
219
+ # Pattern: https?://(?:www\.)?mediaite\.com(?:/[\w-]+){2}
220
+ # Original: https?://(?:www\.)?mediaite\.com(?!/category)(?:/[\w-]+){2}
221
+ https?://(?:www\.)?mediaite\.com(?:/[\w-]+){2}
222
+
223
+ # Source: mixcloud
224
+ # Pattern: https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/([^/]+)
225
+ # Original: https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)
226
+ https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/([^/]+)
227
+
228
+ # Source: muenchentv
229
+ # Pattern: https?://(?:www\.)?muenchen\.tv/livestream
230
+ # Original: https?://(?:www\.)?muenchen\.tv/livestream
231
+ https?://(?:www\.)?muenchen\.tv/livestream
232
+
233
+ # Source: nationalgeographic
234
+ # Pattern: https?://video\.nationalgeographic\.com/.*?
235
+ # Original: https?://video\.nationalgeographic\.com/.*?
236
+ https?://video\.nationalgeographic\.com/.*?
237
+
238
+ # Source: nerdcubed
239
+ # Pattern: https?://(?:www\.)?nerdcubed\.co\.uk/?(?:$|[#?])
240
+ # Original: https?://(?:www\.)?nerdcubed\.co\.uk/?(?:$|[#?])
241
+ https?://(?:www\.)?nerdcubed\.co\.uk/?(?:$|[#?])
242
+
243
+ # Source: nhk
244
+ # Pattern: https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])
245
+ # Original: https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])
246
+ https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])
247
+
248
+ # Source: nobelprize
249
+ # Pattern: https?://(?:(?:mediaplayer|www)\.)?nobelprize\.org/mediaplayer/
250
+ # Original: https?://(?:(?:mediaplayer|www)\.)?nobelprize\.org/mediaplayer/
251
+ https?://(?:(?:mediaplayer|www)\.)?nobelprize\.org/mediaplayer/
252
+
253
+ # Source: pluralsight
254
+ # Pattern: https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?
255
+ # Original: https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?
256
+ https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?
257
+
258
+ # Source: rtvcplay
259
+ # Pattern: https?://(?:www\.)?rtvcplay\.co
260
+ # Original: https?://(?:www\.)?rtvcplay\.co
261
+ https?://(?:www\.)?rtvcplay\.co
262
+
263
+ # Source: seznamzpravy
264
+ # Pattern: https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=
265
+ # Original: https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=
266
+ https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=
267
+
268
+ # Source: skyit
269
+ # Pattern: https?://(?:www\.)?tv8\.it/streaming
270
+ # Original: https?://(?:www\.)?tv8\.it/streaming
271
+ https?://(?:www\.)?tv8\.it/streaming
272
+
273
+ # Source: ted
274
+ # Pattern: https?://embed(?:-ssl)?\.ted\.com/
275
+ # Original: https?://embed(?:-ssl)?\.ted\.com/
276
+ https?://embed(?:-ssl)?\.ted\.com/
277
+
278
+ # Source: tmz
279
+ # Pattern: https?://(?:www\.)?tmz\.com/.*
280
+ # Original: https?://(?:www\.)?tmz\.com/.*
281
+ https?://(?:www\.)?tmz\.com/.*
282
+
283
+ # Source: vimeo
284
+ # Pattern: https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater
285
+ # Original: https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater
286
+ https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater
287
+
288
+ # Source: vrsquare
289
+ # Pattern: https?://livr\.jp/web-search/?\?(?:[^#]+&)?w=[^#]+
290
+ # Original: https?://livr\.jp/web-search/?\?(?:[^#]+&)?w=[^#]+
291
+ https?://livr\.jp/web-search/?\?(?:[^#]+&)?w=[^#]+
292
+
293
+ # Source: wdr
294
+ # Pattern: https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de
295
+ # Original: https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de
296
+ https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de
297
+
@@ -34,6 +34,7 @@ Gem::Specification.new do |spec|
34
34
  spec.add_dependency 'nokogiri', '>= 1.18.9', '< 2.0'
35
35
  spec.add_dependency 'resolv', '>= 0.4.0', '< 1.0'
36
36
  spec.add_dependency 'rubyzip', '>= 2.3.0', '< 3.0'
37
+ spec.add_dependency 'reline', ">= 0.6.2"
37
38
 
38
39
  # Development dependencies
39
40
  spec.add_development_dependency 'minitest', '~> 5.25.5'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: UrlCategorise
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
@@ -229,6 +229,20 @@ dependencies:
229
229
  - - "<"
230
230
  - !ruby/object:Gem::Version
231
231
  version: '3.0'
232
+ - !ruby/object:Gem::Dependency
233
+ name: reline
234
+ requirement: !ruby/object:Gem::Requirement
235
+ requirements:
236
+ - - ">="
237
+ - !ruby/object:Gem::Version
238
+ version: 0.6.2
239
+ type: :runtime
240
+ prerelease: false
241
+ version_requirements: !ruby/object:Gem::Requirement
242
+ requirements:
243
+ - - ">="
244
+ - !ruby/object:Gem::Version
245
+ version: 0.6.2
232
246
  - !ruby/object:Gem::Dependency
233
247
  name: minitest
234
248
  requirement: !ruby/object:Gem::Requirement
@@ -423,12 +437,14 @@ files:
423
437
  - bin/console
424
438
  - bin/export_csv
425
439
  - bin/export_hosts
440
+ - bin/generate_video_lists
426
441
  - bin/rake
427
442
  - bin/setup
428
443
  - correct_usage_example.rb
429
444
  - docs/.keep
430
445
  - docs/v0.1-context.md
431
446
  - docs/v0.1.4-features.md
447
+ - docs/video-url-detection.md
432
448
  - lib/url_categorise.rb
433
449
  - lib/url_categorise/active_record_client.rb
434
450
  - lib/url_categorise/client.rb
@@ -437,6 +453,8 @@ files:
437
453
  - lib/url_categorise/iab_compliance.rb
438
454
  - lib/url_categorise/models.rb
439
455
  - lib/url_categorise/version.rb
456
+ - lists/video_hosting_domains.hosts
457
+ - lists/video_url_patterns.txt
440
458
  - url_categorise.gemspec
441
459
  homepage: https://github.com/TRex22/UrlCategorise
442
460
  licenses: