gtfs_stops_clustering 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 6a9a16f2fa8980f4bd7e84e9912e7925caa556fd2b2daab56209ff8351e93a19
4
+ data.tar.gz: 652b0895096c0b55009d669ea1b8416368dec5884b4a395ca4fd46c762836575
5
+ SHA512:
6
+ metadata.gz: 3ea8cd7f921ff06aa5e838684d1efb8fcaa48fa3b2601e85ed6f926fecebc9d4bb8831399d5bd0e63cf1a2531231e5d8c71b4e56a3287c9ef0aa602b22726a7f
7
+ data.tar.gz: 87bea9a4711300f07b9b6a0151590de265e6c55447a9c1e6ad3b7503b1ffaa45ac5090e1b1859a4786765b1709826519f84876504faac815d7c85282ee1857ed
@@ -0,0 +1,60 @@
1
+ # lib/gtfs_stops_clustering/data_import.rb
2
+
3
+ require 'csv'
4
+ require 'gtfs'
5
+
6
# Turns GTFS stop records into the parallel arrays consumed by the clustering
# step, optionally renaming stops through a "corner cases" CSV file.
module DataImport
  VERSION = '0.0.1'
  attr_accessor :data_import

  # Reads every stop once and exposes three index-aligned collections:
  # +stops_names+, +stops_data+ and +stops_redis_geodata+.
  class DataImport
    attr_accessor :stops, :stops_config_file, :stops_names, :stops_corner_cases, :stops_data, :stops_redis_geodata

    # @param stops [Enumerable] records responding to +lat+, +lon+ and +name+
    # @param stops_config_file [String, nil] path of a CSV file with the
    #   +stop_name+ and +cluster_name+ columns; ignored when it is nil or does
    #   not point to a regular file
    def initialize(stops, stops_config_file)
      @stops = stops
      @stops_config_file = stops_config_file
      @stops_corner_cases = []
      @stops_names = []
      @stops_data = []
      @stops_redis_geodata = []
      import_stops_corner_cases
      import_stops_data
    end

    # Appends one +{ stop_name:, cluster_name: }+ hash per CSV row to
    # +stops_corner_cases+.
    def import_stops_corner_cases
      # File.file? (rather than File.exist?) also skips directories, and the
      # nil guard keeps a missing path from raising a TypeError.
      return unless @stops_config_file && File.file?(@stops_config_file)

      CSV.foreach(@stops_config_file, headers: true) do |row|
        @stops_corner_cases << { stop_name: row['stop_name'], cluster_name: row['cluster_name'] }
      end
    end

    # Appends, for every stop, its (possibly overridden) name, its
    # [latitude, longitude] pair and its [longitude, latitude, member] triple,
    # where the member string is "longitude,latitude".
    def import_stops_data
      overrides = cluster_name_overrides

      @stops.each do |stop|
        latitude = stop.lat
        longitude = stop.lon
        original_name = stop.name

        @stops_names << overrides.fetch(original_name, original_name)
        @stops_data << [latitude, longitude]
        @stops_redis_geodata << [longitude, latitude, "#{longitude},#{latitude}"]
      end
    end

    private

    # Maps each corner-case stop name to its cluster name. When a name is
    # listed several times the first row wins.
    def cluster_name_overrides
      @stops_corner_cases.each_with_object({}) do |entry, overrides|
        name = entry[:stop_name]
        overrides[name] = entry[:cluster_name] unless overrides.key?(name)
      end
    end
  end

  # Builds a DataImport from +args+, keeps it in +data_import+ and returns its
  # three collections as a hash.
  #
  # @return [Hash] with the keys +:stops_data+, +:stops_names+ and
  #   +:stops_redis_geodata+
  def import_stops_data(*args)
    @data_import = DataImport.new(*args)
    {
      stops_data: @data_import.stops_data,
      stops_names: @data_import.stops_names,
      stops_redis_geodata: @data_import.stops_redis_geodata
    }
  end
end
59
+
60
+ include DataImport
@@ -0,0 +1,180 @@
1
+ ## https://github.com/shiguodong/dbscan (fork)
2
+
3
+ require 'distance_measures'
4
+ require 'text'
5
+ require 'geocoder'
6
+ require_relative 'redis_geodata'
7
+
8
# Core extension: lets a coordinate array be used as the receiver of a
# distance computation.
class Array
  # Delegates to Geocoder::Calculations.distance_between, passing this array
  # as the first point and +n+ as the second.
  #
  # @param n [Array] the other point
  # @return whatever Geocoder::Calculations.distance_between returns
  def haversine_distance2(n)
    Geocoder::Calculations.distance_between(self, n)
  end
end
13
+
14
# DBSCAN-style clustering of stops: candidate neighbours come from a geo
# lookup and are then filtered by the similarity of the stop names.
module DBSCAN
  # Runs the clustering as soon as it is built and keeps the outcome in
  # +clusters+ (cluster index => points; index -1 collects unclustered points).
  #
  # It calls +redis_geodata_import+, +geosearch+, +find_cluster_name+ and
  # +find_cluster_position+ with an implicit receiver, so those methods have to
  # be reachable from every object (this library includes the providing
  # modules at the top level).
  class Clusterer
    attr_accessor :points, :options, :clusters

    # @param points [Array<Array>] one [latitude, longitude] pair per stop
    # @param stops_redis_geodata [Array] handed as is to +redis_geodata_import+
    # @param options [Hash] reads +:epsilon+, +:min_points+, +:similarity+ and
    #   +:labels+ (one name per point, same order); +:distance+ only receives a
    #   default. A nil label raises on the first name comparison.
    def initialize(points, stops_redis_geodata, options = {})
      # Work on a copy so the defaults never leak into the caller's hash.
      options = options.dup
      options[:distance] ||= :euclidean_distance
      options[:labels] ||= []

      redis_geodata_import(stops_redis_geodata, options[:epsilon])
      @points = points.each_with_index.map { |coordinates, i| Point.new(coordinates, options[:labels][i]) }
      @options = options
      @clusters = { -1 => [] }

      clusterize!
    end

    # Visits every unvisited point, opening a new cluster whenever a point has
    # at least +options[:min_points]+ neighbours, and names/positions each
    # cluster once it is complete.
    #
    # @return [Array<Point>] the points
    def clusterize!
      @position_index = build_position_index
      current_cluster = -1

      @points.each do |point|
        next if point.visited?

        point.visit!
        neighbors = inmediate_neighbors(point)

        if neighbors.size >= options[:min_points]
          current_cluster += 1
          point.cluster = current_cluster
          members = [point, *add_connected(neighbors, current_cluster)]
          clusters[current_cluster] = members
          describe_cluster(members)
        else
          clusters[-1].push(point)
        end
      end

      # A point parked as noise can be absorbed by a cluster found later on;
      # without this it would be reported both as noise and as a member.
      clusters[-1].reject!(&:cluster)
      @points
    end

    # @return [Hash] cluster index => list of the members' coordinate pairs,
    #   leaving out empty clusters
    def results
      @clusters.each_with_object({}) do |(cluster_index, members), hash|
        flat_members = members.flatten
        hash[cluster_index] = flat_members.map(&:items) unless flat_members.empty?
      end
    end

    # @return [Hash] cluster index => list of stop hashes. +stop_id+,
    #   +stop_code+ and +parent_station+ are left nil for the caller to fill.
    def labeled_results
      @clusters.each_with_object({}) do |(cluster_index, members), hash|
        hash[cluster_index] = members.map do |member|
          {
            stop_id: nil,
            stop_code: nil,
            cluster_name: member.cluster_name,
            cluster_pos: member.cluster_pos,
            stop_name: member.label,
            stop_lat: member.items[0],
            stop_lon: member.items[1],
            parent_station: nil
          }
        end
      end
    end

    # Points returned by +geosearch+ around +point+ whose name similarity with
    # it reaches +options[:similarity]+.
    #
    # @param point [Point]
    # @return [Array<Point>]
    def inmediate_neighbors(point)
      geosearch(point.items[1], point.items[0]).each_with_object([]) do |member, neighbors|
        # Members are "longitude,latitude" strings.
        longitude, latitude = member.split(',')
        neighbor = position_index[[latitude, longitude]]
        next unless neighbor

        neighbors.push(neighbor) if name_similarity(point.label, neighbor.label) >= options[:similarity]
      end
    end

    # Expands a cluster from its seed neighbours.
    #
    # @param neighbors [Array<Point>] grows in place while it is traversed
    # @param current_cluster [Integer] index given to every absorbed point
    # @return [Array<Point>] the points that joined the cluster
    def add_connected(neighbors, current_cluster)
      cluster_points = []

      # Array#each re-reads the length on every step, so the points appended
      # below are traversed as well.
      neighbors.each do |point|
        unless point.visited?
          point.visit!
          reachable = inmediate_neighbors(point)

          if reachable.size >= options[:min_points]
            reachable.each { |candidate| neighbors.push(candidate) unless neighbors.include?(candidate) }
          end
        end

        next if point.cluster

        cluster_points.push(point)
        point.cluster = current_cluster
      end

      cluster_points
    end

    private

    # Stores the shared name and the mean position on every member.
    def describe_cluster(members)
      cluster_name = find_cluster_name(members.map { |member| member.label.capitalize })
      cluster_pos = find_cluster_position(members)

      members.each do |member|
        member.cluster_name = cluster_name
        member.cluster_pos = cluster_pos
      end
    end

    # 1 minus the case-insensitive Levenshtein distance divided by the longer
    # of the two names. Two empty names give NaN, which never passes a
    # threshold comparison.
    def name_similarity(name, other_name)
      distance = Text::Levenshtein.distance(name.downcase, other_name.downcase)
      1 - distance.to_f / [name.length, other_name.length].max
    end

    def position_index
      @position_index ||= build_position_index
    end

    # Lookup table from [latitude, longitude] (as strings, the form in which
    # geosearch members are parsed) to the first point at that position.
    def build_position_index
      @points.each_with_object({}) do |point, index|
        index[[point.items[0].to_s, point.items[1].to_s]] ||= point
      end
    end
  end

  # Leading words shared by all labels, each capitalized and joined by a
  # space; the first label when they share no leading word.
  #
  # @param labels [Array<String>]
  # @return [String, nil] nil for an empty list
  def find_cluster_name(labels)
    return nil if labels.empty?

    word_lists = labels.map { |label| label.strip.split }
    common_words = []

    word_lists.first.each_with_index do |word, i|
      break unless word_lists.all? { |word_list| word_list[i] == word }

      common_words << word.capitalize
    end

    common_words.empty? ? labels.first : common_words.join(' ')
  end

  # Arithmetic mean of the members' coordinates.
  #
  # @param cluster [Array<Point>]
  # @return [Array<Float>] [average latitude, average longitude]
  def find_cluster_position(cluster)
    total_lat = cluster.map { |e| e.items[0].to_f }.sum
    total_lon = cluster.map { |e| e.items[1].to_f }.sum
    [total_lat / cluster.size, total_lon / cluster.size]
  end

  # One stop: its [latitude, longitude] pair, its name and clustering state.
  class Point
    attr_accessor :items, :cluster, :visited, :label, :cluster_name, :cluster_pos

    # @param point [Array] [latitude, longitude]
    # @param label [String, nil] stop name
    def initialize(point, label)
      @items = point
      @cluster = nil
      @visited = false
      @label = label
      @cluster_name = nil
      @cluster_pos = []
    end

    def visited?
      @visited
    end

    def visit!
      @visited = true
    end
  end

  # Builds a Clusterer from +args+ and returns its +labeled_results+.
  def DBSCAN(*args)
    clusterer = Clusterer.new(*args)
    clusterer.labeled_results
  end
end
179
+
180
+ include DBSCAN
@@ -0,0 +1,40 @@
1
+ # lib/gtfs_stops_clustering/redis_geodata.rb
2
+ require 'redis'
3
+
4
# Stores stop positions in a Redis geo set and queries it by radius.
module RedisGeodata
  VERSION = '0.0.1'
  attr_accessor :redis

  # Owns the Redis connection and the key holding the stops.
  class RedisGeodata
    DEFAULT_URL = 'redis://127.0.0.1:6379'
    # Value handed to EXPIRE for the stops key.
    KEY_TTL_SECONDS = 1_000_000
    # Number of stops sent per GEOADD call.
    GEOADD_BATCH_SIZE = 1_000

    attr_accessor :stops, :key, :redis, :epsilon

    # Connects to Redis and loads the stops straight away.
    #
    # @param stops [Array<Array>] [longitude, latitude, member] triples
    # @param epsilon [Numeric] search radius used by #geosearch, in kilometres
    # @param redis_url [String] server to connect to
    def initialize(stops, epsilon, redis_url = DEFAULT_URL)
      @redis = Redis.new(url: redis_url)
      @stops = stops
      @key = 'stops'
      @epsilon = epsilon
      geoadd
    end

    # Adds every stop to the key and (re)sets the key's expiry.
    #
    # Stops are sent in slices so a large feed is not splatted into a single
    # call, and so an empty list sends no GEOADD at all.
    def geoadd
      @stops.each_slice(GEOADD_BATCH_SIZE) { |batch| @redis.geoadd(@key, *batch) }
      @redis.expire(@key, KEY_TTL_SECONDS)
    end

    # Members within +epsilon+ km of the given position, minus the member
    # named "longitude,latitude" (the queried position itself).
    #
    # @return [Array<String>]
    def geosearch(longitude, latitude)
      own_member = "#{longitude},#{latitude}"
      @redis.georadius(@key, longitude, latitude, @epsilon, 'km').reject { |member| member == own_member }
    end
  end

  # Builds a RedisGeodata from +args+ and keeps it in +redis+.
  def redis_geodata_import(*args)
    @redis = RedisGeodata.new(*args)
  end

  # Forwards to the RedisGeodata built by #redis_geodata_import.
  def geosearch(*args)
    @redis.geosearch(*args)
  end
end
39
+
40
+ include RedisGeodata
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module GtfsStopsClustering
  # Kept equal to the version of the published gem.
  VERSION = "0.1.1"
end
@@ -0,0 +1,69 @@
1
+ # lib/gtfs_stops_clustering.rb
2
+
3
+ require 'gtfs'
4
+ require 'csv'
5
+ require_relative './gtfs_stops_clustering/data_import'
6
+ require_relative './gtfs_stops_clustering/dbscan'
7
+
8
# Entry point of the library: merges the stops of several GTFS feeds, clusters
# them and writes a plain-text report of the clusters.
module GtfsStopsClustering
  VERSION = '0.1.1'
  attr_accessor :gtfs_stops_clustering

  # Runs the whole pipeline from its constructor and keeps the outcome in
  # +clusters+.
  #
  # It calls +import_stops_data+ and +DBSCAN+ with an implicit receiver, so
  # both have to be reachable from every object (the files required by this
  # library include their modules at the top level).
  class GtfsStopsClustering
    DEFAULT_OUTPUT_PATH = 'stop_clusters.txt'

    attr_accessor :clusters, :gtfs_urls, :gtfs_stops, :stops_config_path, :epsilon, :min_points, :names_similarity

    # @param gtfs_urls [Array, nil] feed locations handed to GTFS::Source.build;
    #   nothing is clustered when it is nil or empty
    # @param epsilon passed to DBSCAN as +:epsilon+
    # @param min_points passed to DBSCAN as +:min_points+
    # @param names_similarity passed to DBSCAN as +:similarity+
    # @param stops_config_path passed to import_stops_data
    # @param output_path [String, nil] where the report is written; nil skips
    #   the report
    def initialize(gtfs_urls, epsilon, min_points, names_similarity, stops_config_path,
                   output_path = DEFAULT_OUTPUT_PATH)
      @clusters = []
      @gtfs_urls = gtfs_urls
      @gtfs_paths = gtfs_urls
      @stops_config_path = stops_config_path
      @epsilon = epsilon
      @min_points = min_points
      @names_similarity = names_similarity
      @output_path = output_path
      return if gtfs_urls.nil? || gtfs_urls.empty?

      @gtfs_stops = create_stops_merged
      clusterize_stops_csv(@gtfs_stops)
    end

    # @return [Array] the stops of every feed, in feed order
    def create_stops_merged
      @gtfs_paths.flat_map { |gtfs_path| GTFS::Source.build(gtfs_path).stops }
    end

    # Clusters +stops_merged+, copies each stop's id, code and parent station
    # into the matching cluster entry and writes the report.
    #
    # @param stops_merged [Array] records responding to +lat+, +lon+, +id+,
    #   +code+ and +parent_station+
    # @return [Hash] cluster index => list of stop hashes
    def clusterize_stops_csv(stops_merged)
      data = import_stops_data(stops_merged, @stops_config_path)
      @clusters = DBSCAN(
        data[:stops_data],
        data[:stops_redis_geodata],
        epsilon: @epsilon,
        min_points: @min_points,
        similarity: @names_similarity,
        distance: :haversine_distance2,
        labels: data[:stops_names]
      )

      attach_gtfs_identifiers(stops_merged)
      write_clusters_report(@output_path) if @output_path
      @clusters
    end

    private

    # Fills +stop_id+, +stop_code+ and +parent_station+ from the stop found at
    # the same latitude and longitude. When several stops share a position the
    # first one is used; an entry without a matching stop keeps its nils.
    def attach_gtfs_identifiers(stops)
      stops_by_position = stops.each_with_object({}) do |stop, index|
        index[[stop.lat, stop.lon]] ||= stop
      end

      @clusters.each_value do |cluster|
        cluster.each do |entry|
          gtfs_stop = stops_by_position[[entry[:stop_lat], entry[:stop_lon]]]
          next unless gtfs_stop

          entry[:stop_id] = gtfs_stop.id
          entry[:stop_code] = gtfs_stop.code
          entry[:parent_station] = gtfs_stop.parent_station
        end
      end
    end

    # One "Cluster <index>" heading per cluster, followed by the inspected
    # entries and a blank line.
    def write_clusters_report(path)
      File.open(path, 'w') do |file|
        @clusters.each do |cluster_id, cluster|
          file.puts "Cluster #{cluster_id}"
          cluster.each { |entry| file.puts entry.inspect }
          file.puts
        end
      end
    end
  end

  # Clusters the stops of the given feeds, keeps the pipeline object in
  # +gtfs_stops_clustering+ and returns its clusters.
  #
  # @return [Hash, Array] the clusters; an empty array when no feed is given
  def gtfs_stops_clusters(gtfs_urls, epsilon, min_points, names_similarity = 1, stop_config_path = '',
                          output_path = GtfsStopsClustering::DEFAULT_OUTPUT_PATH)
    @gtfs_stops_clustering = GtfsStopsClustering.new(gtfs_urls, epsilon, min_points, names_similarity,
                                                     stop_config_path, output_path)
    @gtfs_stops_clustering.clusters
  end
end
68
+
69
+ include GtfsStopsClustering
metadata ADDED
@@ -0,0 +1,158 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gtfs_stops_clustering
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Visco01
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-12-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: distance_measures
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.0.6
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.0.6
27
+ - !ruby/object:Gem::Dependency
28
+ name: text
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: 1.3.1
37
+ type: :runtime
38
+ prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - "~>"
42
+ - !ruby/object:Gem::Version
43
+ version: '1.3'
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 1.3.1
47
+ - !ruby/object:Gem::Dependency
48
+ name: geocoder
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '1.8'
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: 1.8.2
57
+ type: :runtime
58
+ prerelease: false
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - "~>"
62
+ - !ruby/object:Gem::Version
63
+ version: '1.8'
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: 1.8.2
67
+ - !ruby/object:Gem::Dependency
68
+ name: csv
69
+ requirement: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - "~>"
72
+ - !ruby/object:Gem::Version
73
+ version: '3.2'
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: 3.2.8
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: '3.2'
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ version: 3.2.8
87
+ - !ruby/object:Gem::Dependency
88
+ name: redis
89
+ requirement: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - "~>"
92
+ - !ruby/object:Gem::Version
93
+ version: '5.0'
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: 5.0.8
97
+ type: :runtime
98
+ prerelease: false
99
+ version_requirements: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '5.0'
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: 5.0.8
107
+ - !ruby/object:Gem::Dependency
108
+ name: gtfs
109
+ requirement: !ruby/object:Gem::Requirement
110
+ requirements:
111
+ - - "~>"
112
+ - !ruby/object:Gem::Version
113
+ version: 0.4.1
114
+ type: :runtime
115
+ prerelease: false
116
+ version_requirements: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - "~>"
119
+ - !ruby/object:Gem::Version
120
+ version: 0.4.1
121
+ description: A gem to read GTFS stops data and create clusters based on coordinates
122
+ and stop names' similarities.
123
+ email:
124
+ - visconti373@gmail.com
125
+ executables: []
126
+ extensions: []
127
+ extra_rdoc_files: []
128
+ files:
129
+ - lib/gtfs_stops_clustering.rb
130
+ - lib/gtfs_stops_clustering/data_import.rb
131
+ - lib/gtfs_stops_clustering/dbscan.rb
132
+ - lib/gtfs_stops_clustering/redis_geodata.rb
133
+ - lib/gtfs_stops_clustering/version.rb
134
+ homepage:
135
+ licenses:
136
+ - MIT
137
+ metadata: {}
138
+ post_install_message:
139
+ rdoc_options: []
140
+ require_paths:
141
+ - lib
142
+ required_ruby_version: !ruby/object:Gem::Requirement
143
+ requirements:
144
+ - - ">="
145
+ - !ruby/object:Gem::Version
146
+ version: 2.6.0
147
+ required_rubygems_version: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ requirements: []
153
+ rubygems_version: 3.4.10
154
+ signing_key:
155
+ specification_version: 4
156
+ summary: A gem to read GTFS stops data and create clusters based on coordinates and
157
+ stop names' similarities.
158
+ test_files: []