gtfs_stops_clustering 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 6a9a16f2fa8980f4bd7e84e9912e7925caa556fd2b2daab56209ff8351e93a19
4
+ data.tar.gz: 652b0895096c0b55009d669ea1b8416368dec5884b4a395ca4fd46c762836575
5
+ SHA512:
6
+ metadata.gz: 3ea8cd7f921ff06aa5e838684d1efb8fcaa48fa3b2601e85ed6f926fecebc9d4bb8831399d5bd0e63cf1a2531231e5d8c71b4e56a3287c9ef0aa602b22726a7f
7
+ data.tar.gz: 87bea9a4711300f07b9b6a0151590de265e6c55447a9c1e6ad3b7503b1ffaa45ac5090e1b1859a4786765b1709826519f84876504faac815d7c85282ee1857ed
@@ -0,0 +1,60 @@
1
+ # lib/data_import.rb
2
+
3
+ require 'csv'
4
+ require 'gtfs'
5
+
6
# Converts GTFS stop records into the parallel arrays used by the clustering
# step, optionally renaming stops through a CSV "corner cases" file.
module DataImport
  VERSION='0.0.1'
  attr_accessor :data_import

  # Builds three index-aligned arrays from a list of stop records:
  # * +stops_names+         - stop name, or its configured cluster name
  # * +stops_data+          - [latitude, longitude] pairs
  # * +stops_redis_geodata+ - [longitude, latitude, "longitude,latitude"] triples
  #
  # Each stop record must respond to +lat+, +lon+ and +name+.
  class DataImport
    attr_accessor :stops, :stops_config_file, :stops_names, :stops_corner_cases, :stops_data, :stops_redis_geodata

    # @param stops [Enumerable] stop records responding to +lat+, +lon+, +name+
    # @param stops_config_file [String, nil] path of a CSV file with
    #   +stop_name+ and +cluster_name+ columns; ignored when nil, empty or not
    #   a regular file
    def initialize(stops, stops_config_file)
      @stops = stops
      @stops_config_file = stops_config_file
      @stops_corner_cases = []
      @stops_names = []
      @stops_data = []
      @stops_redis_geodata = []
      import_stops_corner_cases
      import_stops_data
    end

    # Loads the stop_name -> cluster_name overrides from the config file into
    # +stops_corner_cases+. Does nothing when the path does not point to a
    # regular file (nil, '' and directories included).
    def import_stops_corner_cases
      return unless File.file?(@stops_config_file.to_s)

      CSV.foreach(@stops_config_file, headers: true) do |row|
        stops_corner_cases << { stop_name: row['stop_name'], cluster_name: row['cluster_name'] }
      end
    end

    # Appends one entry per stop to the three output arrays, replacing the
    # stop name with its configured cluster name when an override exists.
    def import_stops_data
      overrides = cluster_name_overrides

      @stops.each do |stop|
        latitude = stop.lat
        longitude = stop.lon
        stop_name = stop.name
        stop_name = overrides[stop_name] if overrides.key?(stop_name)

        @stops_names << stop_name
        @stops_data << [latitude, longitude]
        @stops_redis_geodata << [longitude, latitude, "#{longitude},#{latitude}"]
      end
    end

    private

    # Index of the corner cases by stop name. When a stop name is listed more
    # than once the first entry wins.
    def cluster_name_overrides
      @stops_corner_cases.each_with_object({}) do |entry, index|
        name = entry[:stop_name]
        index[name] = entry[:cluster_name] unless index.key?(name)
      end
    end
  end

  # Runs the import and returns the resulting arrays in a hash with the keys
  # +:stops_data+, +:stops_names+ and +:stops_redis_geodata+. The importer
  # instance is kept in +@data_import+.
  #
  # @param args forwarded unchanged to DataImport.new (stops, config path)
  def import_stops_data(*args)
    @data_import = DataImport.new(*args)
    {
      stops_data: @data_import.stops_data,
      stops_names: @data_import.stops_names,
      stops_redis_geodata: @data_import.stops_redis_geodata
    }
  end
end
59
+
60
+ include DataImport
@@ -0,0 +1,180 @@
1
+ ## https://github.com/shiguodong/dbscan (fork)
2
+
3
+ require 'distance_measures'
4
+ require 'text'
5
+ require 'geocoder'
6
+ require_relative 'redis_geodata'
7
+
8
# Core extension that lets a coordinate pair be the receiver of a distance
# computation.
class Array
  # Hands the receiver and +n+, in that order, to
  # Geocoder::Calculations.distance_between and returns its result.
  #
  # @param n [Array] the other coordinate pair
  def haversine_distance2(n)
    origin = self
    Geocoder::Calculations.distance_between(origin, n)
  end
end
13
+
14
+ module DBSCAN
15
# DBSCAN-style clusterer for labelled coordinate pairs.
#
# Neighbourhood queries are delegated to +geosearch+ (backed by the geodata
# loaded through +redis_geodata_import+); a candidate only counts as a
# neighbour when its label is also similar enough to the point's label.
# Clustering runs immediately on construction; read the outcome with
# +results+ or +labeled_results+. Cluster index -1 holds the noise points.
class Clusterer
  attr_accessor :points, :options, :clusters

  # @param points [Array<Array>] [latitude, longitude] pairs
  # @param stops_redis_geodata [Array<Array>] [longitude, latitude, member]
  #   triples handed to +redis_geodata_import+; each member is expected to be
  #   the string "longitude,latitude" of the matching point
  # @param options [Hash] +:epsilon+ (search radius), +:min_points+,
  #   +:similarity+ (label similarity that must be exceeded), +:labels+
  #   (one per point, same order) and +:distance+. The caller's hash is not
  #   modified.
  def initialize(points, stops_redis_geodata, options = {})
    options = options.dup
    options[:distance] ||= :euclidean_distance
    options[:labels] ||= []

    redis_geodata_import(stops_redis_geodata, options[:epsilon])
    @points = points.each_with_index.map { |coords, i| Point.new(coords, options[:labels][i]) }
    @options = options
    @clusters = { -1 => [] }

    clusterize!
  end

  # Visits every point once, growing a new cluster from each point that has
  # at least +:min_points+ neighbours and filing the others as noise. Every
  # member of a cluster receives the cluster's name and position.
  def clusterize!
    @points_by_member = nil # rebuild the lookup in case +points+ was replaced
    current_cluster = -1

    @points.each do |point|
      next if point.visited?

      point.visit!
      neighbors = inmediate_neighbors(point)

      if neighbors.size < options[:min_points]
        clusters[-1].push(point)
        next
      end

      current_cluster += 1
      point.cluster = current_cluster
      members = [point, *add_connected(neighbors, current_cluster)]
      clusters[current_cluster] = members

      cluster_name = find_cluster_name(members.map { |member| member.label.capitalize })
      cluster_pos = find_cluster_position(members)
      members.each do |member|
        member.cluster_name = cluster_name
        member.cluster_pos = cluster_pos
      end
    end

    # A point filed as noise may later be absorbed as a border point of a
    # cluster; keep it only in that cluster so it is not reported twice.
    clusters[-1].reject!(&:cluster)
  end

  # @return [Hash{Integer => Array}] coordinates of the members of every
  #   non-empty cluster, keyed by cluster index
  def results
    @clusters.each_with_object({}) do |(cluster_index, members), hash|
      flat = members.flatten
      hash[cluster_index] = flat.map(&:items) unless flat.empty?
    end
  end

  # @return [Hash{Integer => Array<Hash>}] one stop-shaped hash per member of
  #   every cluster (empty ones included). +stop_id+, +stop_code+ and
  #   +parent_station+ are left nil for the caller to fill in.
  def labeled_results
    @clusters.each_with_object({}) do |(cluster_index, members), hash|
      hash[cluster_index] = members.map do |member|
        {
          stop_id: nil,
          stop_code: nil,
          cluster_name: member.cluster_name,
          cluster_pos: member.cluster_pos,
          stop_name: member.label,
          stop_lat: member.items[0],
          stop_lon: member.items[1],
          parent_station: nil
        }
      end
    end
  end

  # Points returned by +geosearch+ around +point+ whose label similarity to
  # +point+ is strictly greater than +options[:similarity]+. Search hits that
  # do not correspond to a known point are ignored.
  def inmediate_neighbors(point)
    geosearch(point.items[1], point.items[0]).each_with_object([]) do |member, neighbors|
      neighbor = points_by_member[member]
      next unless neighbor

      neighbors.push(neighbor) if label_similarity(point.label, neighbor.label) > options[:similarity]
    end
  end

  # Expands a cluster: walks +neighbors+, pulling in the neighbours of every
  # not-yet-visited point that is itself dense enough, and assigns
  # +current_cluster+ to each point that has no cluster yet.
  #
  # @return [Array<Point>] the points newly assigned to the cluster
  def add_connected(neighbors, current_cluster)
    cluster_points = []

    # +neighbors+ grows while it is being iterated; Array#each re-reads the
    # length on every step, so the appended points are processed as well.
    neighbors.each do |point|
      unless point.visited?
        point.visit!
        reachable = inmediate_neighbors(point)

        if reachable.size >= options[:min_points]
          reachable.each { |candidate| neighbors.push(candidate) unless neighbors.include?(candidate) }
        end
      end

      next if point.cluster

      cluster_points.push(point)
      point.cluster = current_cluster
    end

    cluster_points
  end

  private

  # Lookup of points by their geodata member string ("longitude,latitude").
  # Coordinates are interpolated the same way the member strings are built,
  # so numeric and string coordinates both match. When several points share
  # the same coordinates the first one wins.
  def points_by_member
    @points_by_member ||= @points.each_with_object({}) do |candidate, index|
      index["#{candidate.items[1]},#{candidate.items[0]}"] ||= candidate
    end
  end

  # Case-insensitive Levenshtein similarity of two labels: 1 minus the edit
  # distance divided by the length of the longer label.
  def label_similarity(label, other_label)
    distance = Text::Levenshtein.distance(label.downcase, other_label.downcase)
    1 - distance.to_f / [label.length, other_label.length].max
  end
end
131
+
132
# Derives a cluster name from the labels of its members: the leading words
# shared (exactly, same position) by every label, each capitalized and joined
# with single spaces. Falls back to the first label when the labels share no
# leading word.
#
# @param labels [Array<String>] member labels
# @return [String, nil] the cluster name; nil when +labels+ is empty
def find_cluster_name(labels)
  return nil if labels.empty?

  words = labels.map { |label| label.strip.split }

  # Walk the first label's words and stop at the first position where any
  # other label differs (or has run out of words).
  shared = words.first.each_with_index.take_while do |word, position|
    words.all? { |word_list| word_list[position] == word }
  end

  return labels.first if shared.empty?

  shared.map { |word, _position| word.capitalize }.join(' ')
end
147
+
148
# Centroid of a cluster: the arithmetic mean of the members' first and second
# coordinate, each converted with +to_f+.
#
# @param cluster [Array] members responding to +items+ ([latitude, longitude])
# @return [Array<Float>] [mean of items[0], mean of items[1]]
def find_cluster_position(cluster)
  member_count = cluster.size
  lat_sum = cluster.sum { |member| member.items[0].to_f }
  lon_sum = cluster.sum { |member| member.items[1].to_f }
  [lat_sum / member_count, lon_sum / member_count]
end
155
+
156
# A single point handled by the clusterer: its coordinates and label plus the
# bookkeeping written during clustering.
class Point
  attr_accessor :items, :cluster, :visited, :label, :cluster_name, :cluster_pos

  # @param point [Array] the point's coordinates, stored as +items+
  # @param label [String, nil] the point's label
  def initialize(point, label)
    @items = point
    @label = label
    @cluster = nil      # cluster index, set once the point joins a cluster
    @visited = false
    @cluster_name = nil # explicit: previously this was never assigned
    @cluster_pos = []
  end

  # @return [Boolean] whether the point has already been processed
  def visited?
    @visited
  end

  # Marks the point as processed.
  def visit!
    @visited = true
  end
end
173
+
174
# Convenience entry point: builds a Clusterer from +args+ (which clusters on
# construction) and returns its +labeled_results+.
#
# @param args forwarded unchanged to Clusterer.new
def DBSCAN(*args)
  Clusterer.new(*args).labeled_results
end
178
+ end
179
+
180
+ include DBSCAN
@@ -0,0 +1,40 @@
1
+ # lib/redis_geodata.rb
2
+ require 'redis'
3
+
4
# Stores stop positions in a Redis geospatial set and answers radius queries
# against it.
module RedisGeodata
  VERSION='0.0.1'
  attr_accessor :redis

  # Wraps one Redis connection and the geospatial key holding the stops.
  #
  # NOTE(review): the key is shared ('stops') and is not cleared before
  # loading, so members of earlier imports appear to stay searchable until
  # the key expires - confirm whether that is intended.
  class RedisGeodata
    DEFAULT_URL = 'redis://127.0.0.1:6379'
    # Lifetime given to the key after each load, in seconds.
    KEY_TTL_SECONDS = 1_000_000

    attr_accessor :stops, :key, :redis, :epsilon

    # Connects to Redis and loads +stops+ right away.
    #
    # @param stops [Array<Array>] [longitude, latitude, member] triples
    # @param epsilon [Numeric] search radius used by +geosearch+, in km
    # @param url [String] Redis connection URL
    def initialize(stops, epsilon, url: DEFAULT_URL)
      @redis = Redis.new(url: url)
      @stops = stops
      @key = 'stops'
      @epsilon = epsilon
      geoadd
    end

    # Adds every stop to the geospatial key and (re)sets the key's expiry.
    # Does nothing when there are no stops, because GEOADD without members is
    # rejected by Redis.
    def geoadd
      return if @stops.nil? || @stops.empty?

      @redis.geoadd(@key, *@stops)
      @redis.expire(@key, KEY_TTL_SECONDS)
    end

    # Members within +epsilon+ km of the given position, without the member
    # that names the position itself ("longitude,latitude").
    #
    # @return [Array<String>] member strings of the surrounding stops
    def geosearch(longitude, latitude)
      own_member = "#{longitude},#{latitude}"
      nearby = @redis.georadius(@key, longitude, latitude, @epsilon, 'km')
      nearby.reject { |member| member == own_member }
    end
  end

  # Creates the geodata store and keeps it in +@redis+.
  #
  # @param args forwarded to RedisGeodata.new (stops, epsilon)
  # @param opts keyword options forwarded to RedisGeodata.new (+url:+)
  def redis_geodata_import(*args, **opts)
    @redis = RedisGeodata.new(*args, **opts)
  end

  # Delegates to the store created by +redis_geodata_import+.
  #
  # @param args forwarded to RedisGeodata#geosearch (longitude, latitude)
  def geosearch(*args)
    @redis.geosearch(*args)
  end
end
39
+
40
+ include RedisGeodata
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Namespace of the gem; this file only carries the version constant.
module GtfsStopsClustering
  # NOTE(review): the published package is labelled 0.1.1 and
  # lib/gtfs_stops_clustering.rb defines its own VERSION ('0.0.1') in this
  # same module - confirm which value is authoritative.
  VERSION = '0.1.0'
end
@@ -0,0 +1,69 @@
1
+ # lib/gtfs_stops_clustering.rb
2
+
3
+ require 'gtfs'
4
+ require 'csv'
5
+ require_relative './gtfs_stops_clustering/data_import'
6
+ require_relative './gtfs_stops_clustering/dbscan'
7
+
8
# Entry point of the gem: reads the stops of one or more GTFS feeds and groups
# them into clusters by position and name similarity.
module GtfsStopsClustering
  VERSION='0.0.1'
  attr_accessor :gtfs_stops_clustering

  # Loads the feeds, clusters their stops and writes a plain-text report.
  class GtfsStopsClustering
    attr_accessor :clusters, :gtfs_urls, :gtfs_stops, :stops_config_path, :epsilon, :min_points, :names_similarity

    # Clusters immediately unless no feed is given, in which case +clusters+
    # stays an empty array.
    #
    # @param gtfs_urls [Array<String>, String, nil] feed locations handed to
    #   GTFS::Source.build; a single location or nil is accepted as well
    # @param epsilon [Numeric] neighbourhood radius passed to DBSCAN
    # @param min_points [Integer] minimum neighbours for a dense point
    # @param names_similarity [Numeric] name similarity threshold
    # @param stops_config_path [String] path of the corner-cases CSV file
    def initialize(gtfs_urls, epsilon, min_points, names_similarity, stops_config_path)
      @clusters = []
      @gtfs_urls = Array(gtfs_urls)
      @gtfs_paths = @gtfs_urls
      @stops_config_path = stops_config_path
      @epsilon = epsilon
      @min_points = min_points
      @names_similarity = names_similarity
      return if @gtfs_paths.empty?

      @gtfs_stops = create_stops_merged
      clusterize_stops_csv(@gtfs_stops)
    end

    # @return [Array] the stops of every configured feed, in feed order
    def create_stops_merged
      @gtfs_paths.flat_map { |gtfs_path| GTFS::Source.build(gtfs_path).stops }
    end

    # Clusters +stops_merged+, copies +stop_id+, +stop_code+ and
    # +parent_station+ from the GTFS stop found at each clustered position
    # and writes the clusters to 'stop_clusters.txt' in the working directory.
    #
    # Stops are matched by coordinates only, so stops sharing the exact same
    # position all receive the identifiers of the first such stop; results
    # without a matching stop keep nil identifiers.
    def clusterize_stops_csv(stops_merged)
      data = import_stops_data(stops_merged, @stops_config_path)
      @clusters = DBSCAN(
        data[:stops_data],
        data[:stops_redis_geodata],
        epsilon: @epsilon,
        min_points: @min_points,
        similarity: @names_similarity,
        distance: :haversine_distance2,
        labels: data[:stops_names]
      )

      # One pass to index the stops instead of scanning them for every result.
      stops_by_position = @gtfs_stops.each_with_object({}) do |gtfs_stop, index|
        index[[gtfs_stop.lat, gtfs_stop.lon]] ||= gtfs_stop
      end

      @clusters.each_value do |cluster|
        cluster.each do |stop|
          gtfs_stop = stops_by_position[[stop[:stop_lat], stop[:stop_lon]]]
          next unless gtfs_stop

          stop[:stop_id] = gtfs_stop.id
          stop[:stop_code] = gtfs_stop.code
          stop[:parent_station] = gtfs_stop.parent_station
        end
      end

      output_path = 'stop_clusters.txt'
      File.open(output_path, 'w') do |file|
        @clusters.each do |cluster_id, cluster|
          file.puts "Cluster #{cluster_id}"
          cluster.each { |point| file.puts point.inspect }
          file.puts
        end
      end
    end
  end

  # Clusters the stops of the given feeds and returns the clusters, keyed by
  # cluster index (an empty array when no feed is given). The worker object
  # is kept in +@gtfs_stops_clustering+.
  #
  # NOTE(review): neighbours must have a name similarity strictly greater
  # than +names_similarity+, so the default of 1 looks like it can never be
  # exceeded - confirm the intended default.
  def gtfs_stops_clusters(gtfs_urls, epsilon, min_points, names_similarity = 1, stop_config_path = '')
    @gtfs_stops_clustering = GtfsStopsClustering.new(gtfs_urls, epsilon, min_points, names_similarity, stop_config_path)
    @gtfs_stops_clustering.clusters
  end
end
68
+
69
+ include GtfsStopsClustering
metadata ADDED
@@ -0,0 +1,158 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gtfs_stops_clustering
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Visco01
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-12-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: distance_measures
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.0.6
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.0.6
27
+ - !ruby/object:Gem::Dependency
28
+ name: text
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: 1.3.1
37
+ type: :runtime
38
+ prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - "~>"
42
+ - !ruby/object:Gem::Version
43
+ version: '1.3'
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 1.3.1
47
+ - !ruby/object:Gem::Dependency
48
+ name: geocoder
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '1.8'
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: 1.8.2
57
+ type: :runtime
58
+ prerelease: false
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - "~>"
62
+ - !ruby/object:Gem::Version
63
+ version: '1.8'
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: 1.8.2
67
+ - !ruby/object:Gem::Dependency
68
+ name: csv
69
+ requirement: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - "~>"
72
+ - !ruby/object:Gem::Version
73
+ version: '3.2'
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: 3.2.8
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: '3.2'
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ version: 3.2.8
87
+ - !ruby/object:Gem::Dependency
88
+ name: redis
89
+ requirement: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - "~>"
92
+ - !ruby/object:Gem::Version
93
+ version: '5.0'
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: 5.0.8
97
+ type: :runtime
98
+ prerelease: false
99
+ version_requirements: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '5.0'
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: 5.0.8
107
+ - !ruby/object:Gem::Dependency
108
+ name: gtfs
109
+ requirement: !ruby/object:Gem::Requirement
110
+ requirements:
111
+ - - "~>"
112
+ - !ruby/object:Gem::Version
113
+ version: 0.4.1
114
+ type: :runtime
115
+ prerelease: false
116
+ version_requirements: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - "~>"
119
+ - !ruby/object:Gem::Version
120
+ version: 0.4.1
121
+ description: A gem to read GTFS stops data and create clusters based on coordinates
122
+ and stop names' similarities.
123
+ email:
124
+ - visconti373@gmail.com
125
+ executables: []
126
+ extensions: []
127
+ extra_rdoc_files: []
128
+ files:
129
+ - lib/gtfs_stops_clustering.rb
130
+ - lib/gtfs_stops_clustering/data_import.rb
131
+ - lib/gtfs_stops_clustering/dbscan.rb
132
+ - lib/gtfs_stops_clustering/redis_geodata.rb
133
+ - lib/gtfs_stops_clustering/version.rb
134
+ homepage:
135
+ licenses:
136
+ - MIT
137
+ metadata: {}
138
+ post_install_message:
139
+ rdoc_options: []
140
+ require_paths:
141
+ - lib
142
+ required_ruby_version: !ruby/object:Gem::Requirement
143
+ requirements:
144
+ - - ">="
145
+ - !ruby/object:Gem::Version
146
+ version: 2.6.0
147
+ required_rubygems_version: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ requirements: []
153
+ rubygems_version: 3.4.10
154
+ signing_key:
155
+ specification_version: 4
156
+ summary: A gem to read GTFS stops data and create clusters based on coordinates and
157
+ stop names' similarities.
158
+ test_files: []