udise_school_report_reader 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +20 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +45 -0
  5. data/lib/udise_school_report_reader/activities_data_reader.rb +58 -0
  6. data/lib/udise_school_report_reader/anganwadi_data_reader.rb +22 -0
  7. data/lib/udise_school_report_reader/basic_info_data_reader.rb +29 -0
  8. data/lib/udise_school_report_reader/block_rectangle_combiner.rb +115 -0
  9. data/lib/udise_school_report_reader/building_data_reader.rb +36 -0
  10. data/lib/udise_school_report_reader/characteristics_reader.rb +28 -0
  11. data/lib/udise_school_report_reader/csv_writer.rb +75 -0
  12. data/lib/udise_school_report_reader/data_reader_base.rb +86 -0
  13. data/lib/udise_school_report_reader/digital_facilities_data_reader.rb +42 -0
  14. data/lib/udise_school_report_reader/enrollment_data_reader.rb +136 -0
  15. data/lib/udise_school_report_reader/enrollment_html_writer.rb +81 -0
  16. data/lib/udise_school_report_reader/enrollment_yaml_writer.rb +62 -0
  17. data/lib/udise_school_report_reader/ews_data_reader.rb +118 -0
  18. data/lib/udise_school_report_reader/ews_html_writer.rb +63 -0
  19. data/lib/udise_school_report_reader/ews_yaml_writer.rb +31 -0
  20. data/lib/udise_school_report_reader/location_data_reader.rb +47 -0
  21. data/lib/udise_school_report_reader/official_data_reader.rb +40 -0
  22. data/lib/udise_school_report_reader/pdf_block_extractor.rb +49 -0
  23. data/lib/udise_school_report_reader/pdf_content_compressor.rb +36 -0
  24. data/lib/udise_school_report_reader/pdf_rectangle_extractor.rb +53 -0
  25. data/lib/udise_school_report_reader/rooms_data_reader.rb +36 -0
  26. data/lib/udise_school_report_reader/rte_data_reader.rb +118 -0
  27. data/lib/udise_school_report_reader/rte_html_writer.rb +63 -0
  28. data/lib/udise_school_report_reader/rte_yaml_writer.rb +61 -0
  29. data/lib/udise_school_report_reader/sanitation_data_reader.rb +56 -0
  30. data/lib/udise_school_report_reader/school_report_parser.rb +295 -0
  31. data/lib/udise_school_report_reader/teacher_data_reader.rb +204 -0
  32. data/lib/udise_school_report_reader/version.rb +3 -0
  33. data/lib/udise_school_report_reader.rb +41 -0
  34. data/test/school_report_parser_test.rb +62 -0
  35. metadata +165 -0
@@ -0,0 +1,136 @@
1
# Reads the enrolment table from page 2 of an extracted-cell CSV.
#
# The CSV is expected to have a header row and the columns used below:
# +page+, +text+, +rect_x+, +rect_y+, +text_x+ and +text_y+.
#
# The result of #read is a Hash with:
#   :grade_rows        - grade header cells, sorted left to right
#   :bg_pairs          - { x_mid => [b_cell, g_cell] } for each B/G header pair
#   :"<key>_numbers"   - { x_mid => [b_number_cell, g_number_cell] } for every
#                        entry in ALL_CATEGORIES (either cell may be nil)
class EnrollmentDataReader
  SOCIAL_CATEGORIES = [
    { key: 'gen', label: 'Gen' },
    { key: 'sc', label: 'SC' },
    { key: 'st', label: 'ST' },
    { key: 'obc', label: 'OBC' }
  ].freeze

  RELIGION_CATEGORIES = [
    { key: 'musl', label: 'Musl' },
    { key: 'chris', label: 'Chris' },
    { key: 'sikh', label: 'Sikh' },
    { key: 'budd', label: 'Budd' },
    { key: 'parsi', label: 'Parsi' },
    { key: 'jain', label: 'Jain' },
    { key: 'others', label: 'Others' }
  ].freeze

  OTHER_CATEGORIES = [
    { key: 'aadh', label: 'Aadh' },
    { key: 'bpl', label: 'BPL' },
    { key: 'rept', label: 'Rept' },
    { key: 'cwsn', label: 'CWSN' }
  ].freeze

  # NOTE(review): the label for age 3 is '>3'; confirm against the report
  # whether the printed label is really '>3' and not '<3'.
  AGE_CATEGORIES = (3..22).map do |age|
    { key: "age_#{age}", label: age == 3 ? '>3' : age.to_s }
  end.freeze

  ALL_CATEGORIES = [
    SOCIAL_CATEGORIES,
    RELIGION_CATEGORIES,
    OTHER_CATEGORIES,
    AGE_CATEGORIES
  ].flatten.freeze

  # Header texts recognised as grade columns and as boy/girl sub-columns.
  GRADE_LABELS = ['Pre-Pr', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII'].freeze
  BG_LABELS = ['B', 'G'].freeze

  # Convenience wrapper: builds a reader for +csv_path+ and returns #read.
  def self.read(csv_path) = new(csv_path).read

  # @param csv_path [String] path of the extracted-cell CSV
  def initialize(csv_path)
    @csv_path = csv_path
    @x_cutoff = 0
    @category_y_coords = {}
  end

  # Parses the CSV and returns the enrolment data Hash described on the class,
  # or nil when no grade header cell is found on page 2.
  def read
    # Parse the file once; both passes below only look at page 2.
    page_rows = CSV.foreach(@csv_path, headers: true).select { |row| row['page'] == '2' }

    # Category labels must be located first because number cells are
    # assigned to a category by comparing their rect_y with the label's.
    collect_category_y_coords(page_rows)
    grade_rows, bg_rows, category_rows = classify_rows(page_rows)

    return nil if grade_rows.empty?

    [grade_rows, bg_rows, *category_rows.values].each { |rows| sort_and_trim!(rows) }

    bg_pairs = build_bg_pairs(bg_rows)

    result = {
      grade_rows: grade_rows,
      bg_pairs: bg_pairs
    }

    ALL_CATEGORIES.each do |category|
      result[:"#{category[:key]}_numbers"] = match_numbers_to_pairs(category_rows[category[:key]], bg_pairs)
    end

    result
  end

  private

  # Records the rect_y of every category label found in the label column
  # (cells whose rect_x is within 5.0 of 27.0).
  def collect_category_y_coords(page_rows)
    page_rows.each do |row|
      next unless (row['rect_x'].to_f - 27.0).abs < 5.0

      # An empty CSV field is parsed as nil; treat it as "no label".
      label = row['text'].to_s.downcase
      ALL_CATEGORIES.each do |category|
        @category_y_coords[category[:key]] = row['rect_y'].to_f if label == category[:label].downcase
      end
    end
  end

  # Splits page rows into grade headers, B/G headers and per-category number
  # cells, and records the x position of the "Total" header as the cutoff.
  #
  # @return [Array] [grade_rows, bg_rows, category_rows]
  def classify_rows(page_rows)
    grade_rows = []
    bg_rows = []
    category_rows = ALL_CATEGORIES.to_h { |category| [category[:key], []] }

    page_rows.each do |row|
      text = row['text']
      @x_cutoff = row['rect_x'].to_f if text == 'Total' && row['rect_y'].to_f == 778.0

      if GRADE_LABELS.include?(text)
        grade_rows << row if row['text_y'].to_f == 780.0
      elsif BG_LABELS.include?(text)
        bg_rows << row if row['text_y'].to_f == 768.0
      elsif /^\d+$/.match?(text)
        y_coord = row['rect_y'].to_f
        ALL_CATEGORIES.each do |category|
          label_y = @category_y_coords[category[:key]]
          category_rows[category[:key]] << row if label_y && (y_coord - label_y).abs < 5.0
        end
      end
    end

    [grade_rows, bg_rows, category_rows]
  end

  # Sorts cells left to right and drops those at or beyond the cutoff.
  # NOTE(review): when no "Total" header is found @x_cutoff stays 0, so every
  # cell is discarded here — confirm that this is the intended fallback.
  def sort_and_trim!(rows)
    rows.sort_by! { |row| row['text_x'].to_f }
    rows.reject! { |row| row['text_x'].to_f >= @x_cutoff }
  end

  # Groups consecutive B/G header cells into pairs keyed by their x midpoint.
  # A trailing cell without a partner is skipped instead of raising.
  def build_bg_pairs(bg_rows)
    bg_rows.each_slice(2).each_with_object({}) do |(b, g), pairs|
      next unless b && g

      pairs[(b['text_x'].to_f + g['text_x'].to_f) / 2] = [b, g]
    end
  end

  # For each B/G pair, picks the first remaining number cell whose text_x is
  # within +threshold+ of the B header and likewise for the G header. A cell
  # is removed from the candidates once it has been assigned.
  #
  # @return [Hash] { x_mid => [b_number_or_nil, g_number_or_nil] }
  def match_numbers_to_pairs(remaining_numbers, bg_pairs, threshold = 10.0)
    numbers = {}
    remaining = remaining_numbers.dup

    bg_pairs.each do |x_mid, bg_pair|
      b_x = bg_pair[0]['text_x'].to_f
      g_x = bg_pair[1]['text_x'].to_f

      b_num = remaining.find { |row| (row['text_x'].to_f - b_x).abs < threshold }
      remaining.delete(b_num) if b_num

      g_num = remaining.find { |row| (row['text_x'].to_f - g_x).abs < threshold }
      remaining.delete(g_num) if g_num

      numbers[x_mid] = [b_num, g_num]
    end

    numbers
  end
end
@@ -0,0 +1,81 @@
1
# Renders the Hash produced by the enrolment reader as a standalone HTML table.
class EnrollmentHtmlWriter
  # Writes an HTML table with one row per category/age and one B/G column
  # pair per entry of data[:bg_pairs].
  #
  # @param data [Hash, nil] must provide :grade_rows and :bg_pairs; the
  #   per-category :"<key>_numbers" hashes are optional — a missing one
  #   produces a row of empty cells
  # @param html_path [String] destination file
  # @return [Integer, nil] the value returned by File.write, or nil when
  #   +data+ is nil (nothing is written in that case)
  #
  # NOTE: cell texts are interpolated into the markup without HTML escaping.
  def self.generate_html(data, html_path)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]

    categories = [
      ['Gen', data[:gen_numbers]],
      ['SC', data[:sc_numbers]],
      ['ST', data[:st_numbers]],
      ['OBC', data[:obc_numbers]],
      ['Muslim', data[:musl_numbers]],
      ['Christian', data[:chris_numbers]],
      ['Sikh', data[:sikh_numbers]],
      ['Buddhist', data[:budd_numbers]],
      ['Parsi', data[:parsi_numbers]],
      ['Jain', data[:jain_numbers]],
      ['Others', data[:others_numbers]],
      ['Aadhaar', data[:aadh_numbers]],
      ['BPL', data[:bpl_numbers]],
      ['Repeater', data[:rept_numbers]],
      ['CWSN', data[:cwsn_numbers]]
    ]

    ages = (3..22).map do |age|
      ["Age #{age}", data[:"age_#{age}_numbers"]]
    end

    # One <tr> per category and age, with a boys and a girls cell per pair.
    table_rows = (categories + ages).map do |category, numbers|
      cells = bg_pairs.map do |x_mid, _|
        # Safe lookup: a category absent from +data+ yields empty cells
        # instead of raising on nil.
        nums = numbers&.[](x_mid)
        b_num = nums&.first
        g_num = nums&.last
        "<td>#{b_num ? b_num['text'] : ''}</td><td>#{g_num ? g_num['text'] : ''}</td>"
      end.join

      " <tr>\n" \
      " <td class=\"category\">#{category}</td>\n" \
      " #{cells}\n" \
      " </tr>"
    end.join("\n")

    # Header cells: one two-column grade header and one B/G pair per grade.
    grade_headers = grade_rows.map { |row| "<th colspan='2'>#{row['text']}</th>" }.join
    bg_headers = grade_rows.map { |_| "<td>B</td><td>G</td>" }.join

    html_content = <<~HTML
      <!DOCTYPE html>
      <html>
      <head>
      <title>Enrollment Table</title>
      <style>
      table { border-collapse: collapse; margin-top: 20px; width: 100%; }
      th, td { border: 1px solid black; padding: 8px; text-align: center; }
      .header { font-weight: bold; background-color: #f0f0f0; }
      .grade { font-weight: bold; background-color: #e0e0e0; }
      .bg-pair { background-color: #f8f8f8; }
      .category { font-weight: bold; text-align: left; }
      </style>
      </head>
      <body>
      <h2>Enrolment (By Social Category)</h2>
      <table>
      <tr class="grade">
      <th rowspan="2">Category</th>
      #{grade_headers}
      </tr>
      <tr class="bg-pair">
      #{bg_headers}
      </tr>
      #{table_rows}
      </table>
      </body>
      </html>
    HTML

    File.write(html_path, html_content)
  end
end
@@ -0,0 +1,62 @@
1
# Converts the enrolment reader's Hash into a plain nested Hash of counts,
# shaped as { category => { grade => { 'boys' => n, 'girls' => n } } }.
class EnrollmentYamlWriter
  # @param data [Hash, nil] needs :grade_rows and :bg_pairs plus the
  #   :"<key>_numbers" entries; missing number entries give nil counts
  # @return [Hash, nil] nil when +data+ is nil
  def self.format_yaml(data)
    return unless data

    grades = data[:grade_rows]
    pairs = data[:bg_pairs]

    # Output name paired with the key it is read from in +data+.
    sections = [
      ['gen', :gen_numbers],
      ['sc', :sc_numbers],
      ['st', :st_numbers],
      ['obc', :obc_numbers],
      ['muslim', :musl_numbers],
      ['christian', :chris_numbers],
      ['sikh', :sikh_numbers],
      ['buddhist', :budd_numbers],
      ['parsi', :parsi_numbers],
      ['jain', :jain_numbers],
      ['others', :others_numbers],
      ['aadhaar', :aadh_numbers],
      ['bpl', :bpl_numbers],
      ['repeater', :rept_numbers],
      ['cwsn', :cwsn_numbers]
    ]
    # Age sections follow the named categories, ages 3 through 22.
    sections += (3..22).map { |age| ["age_#{age}", :"age_#{age}_numbers"] }

    sections.each_with_object({}) do |(name, source_key), out|
      out[name] = counts_by_grade(grades, pairs, data[source_key])
    end
  end

  # Builds { grade_name => { 'boys' => ..., 'girls' => ... } } for one section.
  # The pair at position i is labelled with the text of grade cell i; pairs
  # without a labelled grade cell are skipped.
  def self.counts_by_grade(grades, pairs, numbers)
    counts = {}
    pairs.each_with_index do |(x_mid, _pair), position|
      label = grades[position] && grades[position]['text']
      next unless label

      matched = numbers&.[](x_mid)
      counts[label.downcase.gsub(' ', '_')] = {
        'boys' => cell_count(matched&.first),
        'girls' => cell_count(matched&.last)
      }
    end
    counts
  end
  private_class_method :counts_by_grade

  # Integer value of a cell's stripped text, or nil when the cell or its
  # text is missing or blank.
  def self.cell_count(cell)
    text = cell&.[]('text')&.strip
    return nil if text.nil? || text.empty?

    text.to_i
  end
  private_class_method :cell_count
end
@@ -0,0 +1,118 @@
1
# Reads the EWS enrolment table from page 1 of an extracted-cell CSV.
#
# Cells are grouped by rect_y. The group containing TITLE_TEXT is the title;
# the three groups with the next lower rect_y values are taken, in that
# order, as the grade header row, the B/G header row and the values row.
class EwsDataReader
  GRADES = [
    'Pre-Pri.', 'Class I', 'Class II', 'Class III', 'Class IV', 'Class V',
    'Class VI', 'Class VII', 'Class VIII', 'Class IX', 'Class X', 'Class XI', 'Class XII'
  ].freeze

  # Text that identifies the title cell of the table.
  TITLE_TEXT = 'Total no. of Economically Weaker Section*(EWS) students Enrolled in Schools'

  # Text used for missing or blank value cells.
  PLACEHOLDER = '-'

  # Convenience wrapper: builds a reader for +csv_path+ and returns #read.
  def self.read(csv_path) = new(csv_path).read

  # Loads the CSV and locates the table rows. When the title or the three
  # rows below it cannot be found the row variables stay nil and #read
  # returns nil.
  #
  # @param csv_path [String] path of the extracted-cell CSV
  def initialize(csv_path)
    @csv_path = csv_path
    @rows = Hash.new { |h, k| h[k] = [] }

    load_page_one_cells
    locate_table_rows
  end

  # @return [Hash, nil] { grade_rows:, bg_pairs:, ews_numbers: } or nil when
  #   the table was not located
  def read
    return nil unless @grades_row && @bg_row && @values_row

    # Group B,G cells into pairs keyed by x midpoint, skipping incomplete pairs.
    bg_pairs = {}
    @bg_row.each_slice(2) do |pair|
      next unless pair.size == 2 && pair[0] && pair[1]

      b, g = pair
      x_mid = (b['text_x'].to_f + g['text_x'].to_f) / 2
      bg_pairs[x_mid] = [b, g]
    end

    {
      grade_rows: @grades_row,
      bg_pairs: bg_pairs,
      ews_numbers: match_numbers_to_pairs(@values_row, bg_pairs)
    }
  end

  private

  # Groups every page-1 cell by its rect_y.
  def load_page_one_cells
    CSV.foreach(@csv_path, headers: true) do |cell|
      next unless cell['page'] == '1'

      @rows[cell['rect_y'].to_f] << cell
    end
  end

  # Finds the title row and assigns @grades_row, @bg_row and @values_row.
  def locate_table_rows
    @title_row = @rows.find { |_, cells| cells.any? { |cell| cell['text']&.include?(TITLE_TEXT) } }

    title_y = @title_row&.first
    return unless title_y

    # Rows with a smaller rect_y than the title, largest rect_y first.
    rows_below = @rows.select { |y, _| y < title_y }.sort_by(&:first).reverse
    return unless rows_below.size >= 3

    @grades_row, @bg_row, raw_values = rows_below.first(3).map(&:last)

    [@grades_row, @bg_row].each do |row|
      row.sort_by! { |cell| cell['text_x'].to_f }
    end

    @values_row = align_values(raw_values)
  end

  # Returns one value cell per B and per G header, in header order. A header
  # with no value cell within 10.0 of its text_x gets a placeholder cell, and
  # blank texts (including nil from an empty CSV field) become PLACEHOLDER.
  # A trailing header cell without a partner is skipped instead of raising.
  def align_values(raw_values)
    aligned = []

    @bg_row.each_slice(2) do |b, g|
      next unless b && g

      [b, g].each do |header|
        header_x = header['text_x'].to_f
        value = raw_values.find { |cell| (cell['text_x'].to_f - header_x).abs < 10.0 }
        aligned << (value || { 'text' => PLACEHOLDER, 'text_x' => header_x })
      end
    end

    aligned.each { |cell| cell['text'] = PLACEHOLDER if cell['text'].to_s.strip.empty? }
  end

  # For each B/G pair, picks the first remaining value cell whose text_x is
  # within +threshold+ of the B header and likewise for the G header.
  #
  # @return [Hash] { x_mid => [b_value_or_nil, g_value_or_nil] }
  def match_numbers_to_pairs(remaining_numbers, bg_pairs, threshold = 10.0)
    numbers = {}
    remaining = remaining_numbers.dup

    bg_pairs.each do |x_mid, bg_pair|
      next unless bg_pair && bg_pair.size == 2

      b_x = bg_pair[0]['text_x'].to_f
      g_x = bg_pair[1]['text_x'].to_f

      b_num = remaining.find { |cell| (cell['text_x'].to_f - b_x).abs < threshold }
      remaining.delete(b_num) if b_num

      g_num = remaining.find { |cell| (cell['text_x'].to_f - g_x).abs < threshold }
      remaining.delete(g_num) if g_num

      numbers[x_mid] = [b_num, g_num]
    end

    numbers
  end
end
@@ -0,0 +1,63 @@
1
# Renders the Hash produced by the EWS reader as a standalone HTML table.
class EwsHtmlWriter
  # Writes an HTML table with a single "EWS" row and one B/G column pair per
  # entry of data[:bg_pairs].
  #
  # @param data [Hash, nil] must provide :grade_rows and :bg_pairs;
  #   :ews_numbers is optional — when absent the row has empty cells
  # @param html_path [String] destination file
  # @return [Integer, nil] the value returned by File.write, or nil when
  #   +data+ is nil (nothing is written in that case)
  #
  # NOTE: cell texts are interpolated into the markup without HTML escaping.
  # NOTE(review): the page title and heading below are the same strings the
  # enrolment writer uses — confirm whether an EWS-specific heading is wanted.
  def self.generate_html(data, html_path)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]

    categories = [
      ['EWS', data[:ews_numbers]]
    ]

    # One <tr> per category, with a boys and a girls cell per pair.
    table_rows = categories.map do |category, numbers|
      cells = bg_pairs.map do |x_mid, _|
        # Safe lookup: missing numbers yield empty cells instead of raising.
        nums = numbers&.[](x_mid)
        b_num = nums&.first
        g_num = nums&.last
        "<td>#{b_num ? b_num['text'] : ''}</td><td>#{g_num ? g_num['text'] : ''}</td>"
      end.join

      " <tr>\n" \
      " <td class=\"category\">#{category}</td>\n" \
      " #{cells}\n" \
      " </tr>"
    end.join("\n")

    # Header cells: one two-column grade header and one B/G pair per grade.
    grade_headers = grade_rows.map { |row| "<th colspan='2'>#{row['text']}</th>" }.join
    bg_headers = grade_rows.map { |_| "<td>B</td><td>G</td>" }.join

    html_content = <<~HTML
      <!DOCTYPE html>
      <html>
      <head>
      <title>Enrollment Table</title>
      <style>
      table { border-collapse: collapse; margin-top: 20px; width: 100%; }
      th, td { border: 1px solid black; padding: 8px; text-align: center; }
      .header { font-weight: bold; background-color: #f0f0f0; }
      .grade { font-weight: bold; background-color: #e0e0e0; }
      .bg-pair { background-color: #f8f8f8; }
      .category { font-weight: bold; text-align: left; }
      </style>
      </head>
      <body>
      <h2>Enrolment (By Social Category)</h2>
      <table>
      <tr class="grade">
      <th rowspan="2">Category</th>
      #{grade_headers}
      </tr>
      <tr class="bg-pair">
      #{bg_headers}
      </tr>
      #{table_rows}
      </table>
      </body>
      </html>
    HTML

    File.write(html_path, html_content)
  end
end
@@ -0,0 +1,31 @@
1
# Converts the EWS reader's Hash into a plain nested Hash of counts,
# shaped as { 'ews' => { grade => { 'boys' => n, 'girls' => n } } }.
class EwsYamlWriter
  # @param data [Hash, nil] needs :grade_rows and :bg_pairs; a missing
  #   :ews_numbers entry gives nil counts
  # @return [Hash, nil] nil when +data+ is nil
  def self.format_yaml(data)
    return unless data

    grades = data[:grade_rows]
    pairs = data[:bg_pairs]
    sections = { 'ews' => data[:ews_numbers] }

    # Integer value of a cell's stripped text; nil for a missing/blank cell.
    to_count = lambda do |cell|
      text = cell&.[]('text')&.strip
      text.nil? || text.empty? ? nil : text.to_i
    end

    sections.each_with_object({}) do |(name, numbers), out|
      per_grade = out[name] = {}

      # The pair at position i is labelled with the text of grade cell i.
      pairs.each_with_index do |(x_mid, _pair), position|
        label = grades[position] && grades[position]['text']
        next unless label

        matched = numbers&.[](x_mid)
        per_grade[label.downcase.gsub(' ', '_')] = {
          'boys' => to_count.call(matched&.first),
          'girls' => to_count.call(matched&.last)
        }
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require_relative 'data_reader_base'
2
+
3
# Field table for the location section of the report.
#
# Each entry maps a label text to the nested result key it is stored under
# (+key_path+) and, for most labels, an +end_pattern+ regex. How these
# options are interpreted is defined by DataReaderBase, which is not part
# of this file.
class LocationDataReader
  include DataReaderBase

  FIELD_MAPPINGS = {
    'State' => { key_path: %w[location state], end_pattern: /District/ },
    'District' => { key_path: %w[location district], end_pattern: /Block/ },
    'Block' => { key_path: %w[location block], end_pattern: /Rural/ },
    'Rural / Urban' => { key_path: %w[location area_type], end_pattern: /Cluster/ },
    # Pincode is the only entry without an end_pattern.
    'Pincode' => { key_path: %w[location pincode] },
    'Ward' => { key_path: %w[location ward], end_pattern: /Mohalla/ },
    'Cluster' => { key_path: %w[location cluster], end_pattern: /Ward/ },
    'Municipality' => { key_path: %w[location municipality], end_pattern: /Assembly/ },
    'Assembly Const.' => { key_path: %w[location assembly_constituency], end_pattern: /Parl/ },
    'Parl. Constituency' => { key_path: %w[location parliamentary_constituency], end_pattern: /School/ }
  }
end
@@ -0,0 +1,40 @@
1
+ require_relative 'data_reader_base'
2
+
3
# Field table for the official/recognition section of the report.
#
# Each entry maps a label text to the nested result key it is stored under
# (+key_path+) together with either a +value_type+ or an +end_pattern+.
# How these options are interpreted is defined by DataReaderBase, which is
# not part of this file.
class OfficialDataReader
  include DataReaderBase

  FIELD_MAPPINGS = {
    # Year fields are declared with value_type :integer.
    'Year of Establishment' => { key_path: %w[official established], value_type: :integer },
    'Year of Recognition-Pri.' => { key_path: %w[official recognition primary], value_type: :integer },
    'Year of Recognition-Upr.Pri.' => { key_path: %w[official recognition upper_primary], value_type: :integer },
    'Year of Recognition-Sec.' => { key_path: %w[official recognition secondary], value_type: :integer },
    'Year of Recognition-Higher Sec.' => { key_path: %w[official recognition higher_secondary], value_type: :integer },
    # Text fields carry an end_pattern instead of a value_type.
    'Affiliation Board-Sec' => { key_path: %w[official affiliation secondary], end_pattern: /Affiliation Board-HSec/ },
    'Affiliation Board-HSec' => { key_path: %w[official affiliation higher_secondary], end_pattern: /Is this/ },
    'School Management' => { key_path: %w[official management], end_pattern: /School Type/ }
  }
end
@@ -0,0 +1,49 @@
1
# Extracts positioned text blocks (BT ... ET sections) from the raw content
# stream of every page of a PDF reader object.
class PDFBlockExtractor
  # BT / ET are matched as whitespace-delimited tokens so that text such as
  # "(SUBTOTAL) Tj" is not mistaken for the start or end of a block.
  BEGIN_TEXT = /(?<!\S)BT(?!\S)/
  END_TEXT = /(?<!\S)ET(?!\S)/
  # "1 0 0 1 <x> <y> Tm" — only this exact matrix form is recognised.
  TEXT_MATRIX = /1\s+0\s+0\s+1\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+Tm/
  # "/F<n> <size> Tf"
  FONT = %r{/F(\d+)\s+(\d+(\.\d+)?)\s+Tf}
  # "(<text>) Tj"
  SHOW_TEXT = /\((.*?)\)\s*Tj/

  # @param reader [#pages] object whose pages respond to +raw_content+
  #   (a String containing the page's content stream)
  # @return [Array<Hash>] one Hash per non-empty block that has coordinates,
  #   with keys :page (1-based), :start_line, :text (pieces joined by a
  #   space), :x, :y, :end_line and, when a font line was seen, :font and
  #   :font_size
  def self.extract_blocks(reader)
    blocks = []

    reader.pages.each_with_index do |page, index|
      page_number = index + 1
      # nil while outside a BT ... ET section; operators seen there are ignored.
      current_block = nil

      page.raw_content.each_line do |line|
        if BEGIN_TEXT.match?(line)
          current_block = {
            page: page_number,
            start_line: line.strip,
            text: [] # collects every text piece shown inside the block
          }
        elsif current_block.nil?
          next
        elsif (matrix = TEXT_MATRIX.match(line))
          # Only the first position inside a block is kept.
          unless current_block[:x] && current_block[:y]
            current_block[:x] = matrix[1].to_f
            current_block[:y] = matrix[2].to_f
          end
        elsif (font = FONT.match(line))
          # Only the first font inside a block is kept.
          unless current_block[:font] && current_block[:font_size]
            current_block[:font] = "F#{font[1]}"
            current_block[:font_size] = font[2].to_f
          end
        elsif (shown = SHOW_TEXT.match(line))
          # Backslashes (escape characters) are removed from the text.
          current_block[:text] << shown[1].gsub(/\\/, '')
        elsif END_TEXT.match?(line)
          current_block[:end_line] = line.strip
          current_block[:text] = current_block[:text].join(' ')
          # Keep only non-empty blocks that have coordinates.
          if !current_block[:text].empty? && current_block[:x] && current_block[:y]
            blocks << current_block
          end
          # Leave the block so later stray operators cannot touch it.
          current_block = nil
        end
      end
    end

    blocks
  end
end
@@ -0,0 +1,36 @@
1
# Reduces a PDF content stream to the text shown inside its BT ... ET
# sections, one piece of text per output line.
class PDFContentCompressor
  # BT / ET are matched as whitespace-delimited tokens so that text such as
  # "(STREET) Tj" or "(SUBTOTAL) Tj" is not mistaken for a block boundary.
  BEGIN_TEXT = /(?<!\S)BT(?!\S)/
  END_TEXT = /(?<!\S)ET(?!\S)/
  # "(<text>) Tj"
  SHOW_TEXT = /\((.*?)\)\s*Tj/
  # Known fragments of words that the stream splits across several Tj
  # operators; a fragment is appended to the text collected so far instead
  # of starting a new output line.
  SPLIT_FRAGMENT = /^(?:Non|Residenti|al|Digit|al Facil|ities)$/

  # @param content [String] raw content stream
  # @return [String] the collected texts joined by newlines ("" when none)
  def self.compress(content)
    compressed = []
    in_bt_block = false
    pending = '' # text collected since the last emitted line

    content.each_line do |line|
      if BEGIN_TEXT.match?(line)
        in_bt_block = true
        pending = ''
      elsif END_TEXT.match?(line)
        in_bt_block = false
        compressed << pending unless pending.empty?
        # Cleared so a stray second ET cannot emit the same text again.
        pending = ''
      elsif in_bt_block && (shown = SHOW_TEXT.match(line))
        text = shown[1].strip
        if SPLIT_FRAGMENT.match?(text)
          pending += text
        else
          # A regular piece of text closes the previous one.
          compressed << pending unless pending.empty?
          pending = text
        end
      end
    end

    compressed.reject(&:empty?).join("\n")
  end
end
36
+ end