udise_school_report_reader 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +20 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +45 -0
  5. data/lib/udise_school_report_reader/activities_data_reader.rb +58 -0
  6. data/lib/udise_school_report_reader/anganwadi_data_reader.rb +22 -0
  7. data/lib/udise_school_report_reader/basic_info_data_reader.rb +29 -0
  8. data/lib/udise_school_report_reader/block_rectangle_combiner.rb +115 -0
  9. data/lib/udise_school_report_reader/building_data_reader.rb +36 -0
  10. data/lib/udise_school_report_reader/characteristics_reader.rb +28 -0
  11. data/lib/udise_school_report_reader/csv_writer.rb +75 -0
  12. data/lib/udise_school_report_reader/data_reader_base.rb +86 -0
  13. data/lib/udise_school_report_reader/digital_facilities_data_reader.rb +42 -0
  14. data/lib/udise_school_report_reader/enrollment_data_reader.rb +136 -0
  15. data/lib/udise_school_report_reader/enrollment_html_writer.rb +81 -0
  16. data/lib/udise_school_report_reader/enrollment_yaml_writer.rb +62 -0
  17. data/lib/udise_school_report_reader/ews_data_reader.rb +118 -0
  18. data/lib/udise_school_report_reader/ews_html_writer.rb +63 -0
  19. data/lib/udise_school_report_reader/ews_yaml_writer.rb +31 -0
  20. data/lib/udise_school_report_reader/location_data_reader.rb +47 -0
  21. data/lib/udise_school_report_reader/official_data_reader.rb +40 -0
  22. data/lib/udise_school_report_reader/pdf_block_extractor.rb +49 -0
  23. data/lib/udise_school_report_reader/pdf_content_compressor.rb +36 -0
  24. data/lib/udise_school_report_reader/pdf_rectangle_extractor.rb +53 -0
  25. data/lib/udise_school_report_reader/rooms_data_reader.rb +36 -0
  26. data/lib/udise_school_report_reader/rte_data_reader.rb +118 -0
  27. data/lib/udise_school_report_reader/rte_html_writer.rb +63 -0
  28. data/lib/udise_school_report_reader/rte_yaml_writer.rb +61 -0
  29. data/lib/udise_school_report_reader/sanitation_data_reader.rb +56 -0
  30. data/lib/udise_school_report_reader/school_report_parser.rb +295 -0
  31. data/lib/udise_school_report_reader/teacher_data_reader.rb +204 -0
  32. data/lib/udise_school_report_reader/version.rb +3 -0
  33. data/lib/udise_school_report_reader.rb +41 -0
  34. data/test/school_report_parser_test.rb +62 -0
  35. metadata +165 -0
@@ -0,0 +1,136 @@
1
# Reads the enrollment table from page 2 of the extracted-cells CSV and groups
# its numbers by category and by boy/girl column pair.
#
# The CSV must provide the columns read below: page, text, text_x, text_y,
# rect_x and rect_y.
class EnrollmentDataReader
  SOCIAL_CATEGORIES = [
    { key: 'gen', label: 'Gen' },
    { key: 'sc', label: 'SC' },
    { key: 'st', label: 'ST' },
    { key: 'obc', label: 'OBC' }
  ].freeze

  RELIGION_CATEGORIES = [
    { key: 'musl', label: 'Musl' },
    { key: 'chris', label: 'Chris' },
    { key: 'sikh', label: 'Sikh' },
    { key: 'budd', label: 'Budd' },
    { key: 'parsi', label: 'Parsi' },
    { key: 'jain', label: 'Jain' },
    { key: 'others', label: 'Others' }
  ].freeze

  OTHER_CATEGORIES = [
    { key: 'aadh', label: 'Aadh' },
    { key: 'bpl', label: 'BPL' },
    { key: 'rept', label: 'Rept' },
    { key: 'cwsn', label: 'CWSN' }
  ].freeze

  # One entry per age row; the first row is labelled '>3' in the table.
  AGE_CATEGORIES = (3..22).map do |age|
    { key: "age_#{age}", label: age == 3 ? '>3' : age.to_s }
  end.freeze

  ALL_CATEGORIES = [
    SOCIAL_CATEGORIES,
    RELIGION_CATEGORIES,
    OTHER_CATEGORIES,
    AGE_CATEGORIES
  ].flatten.freeze

  # Column headers of the enrollment table, as they appear in the CSV.
  GRADE_LABELS = %w[Pre-Pr I II III IV V VI VII VIII IX X XI XII].freeze

  def self.read(csv_path) = new(csv_path).read

  # @param csv_path [String] path of the extracted-cells CSV
  def initialize(csv_path)
    @csv_path = csv_path
    # NOTE(review): if no 'Total' header is found the cutoff stays 0 and every
    # row is filtered out in #read — confirm that this is the intended fallback.
    @x_cutoff = 0
    @category_y_coords = {}
  end

  # @return [Hash, nil] nil when no grade header is found; otherwise a hash
  #   with :grade_rows, :bg_pairs (midpoint x => [B row, G row]) and one
  #   :"<key>_numbers" entry per category (midpoint x => [boys row, girls row]).
  def read
    # The file is parsed once; only page-2 rows are relevant to both passes.
    page_rows = CSV.foreach(@csv_path, headers: true).select { |row| row['page'] == '2' }

    locate_category_rows(page_rows)

    grade_rows = []
    bg_rows = []
    category_rows = ALL_CATEGORIES.to_h { |category| [category[:key], []] }

    page_rows.each do |row|
      text = row['text']
      @x_cutoff = row['rect_x'].to_f if text == 'Total' && row['rect_y'].to_f == 778.0

      if GRADE_LABELS.include?(text)
        grade_rows << row if row['text_y'].to_f == 780.0
      elsif %w[B G].include?(text)
        bg_rows << row if row['text_y'].to_f == 768.0
      elsif text.to_s.match?(/^\d+$/)
        y_coord = row['rect_y'].to_f
        ALL_CATEGORIES.each do |category|
          category_y = @category_y_coords[category[:key]]
          category_rows[category[:key]] << row if category_y && (y_coord - category_y).abs < 5.0
        end
      end
    end

    return nil if grade_rows.empty?

    # Order every row set left to right and drop the cells at or beyond the
    # 'Total' column.
    [grade_rows, bg_rows, *category_rows.values].each do |rows|
      rows.sort_by! { |row| row['text_x'].to_f }
      rows.reject! { |row| row['text_x'].to_f >= @x_cutoff }
    end

    # Pair consecutive B/G markers; a trailing marker without a partner is
    # skipped instead of raising on nil.
    bg_pairs = bg_rows.each_slice(2).filter_map do |b, g|
      next unless g

      [(b['text_x'].to_f + g['text_x'].to_f) / 2, [b, g]]
    end.to_h

    result = { grade_rows: grade_rows, bg_pairs: bg_pairs }
    ALL_CATEGORIES.each do |category|
      result[:"#{category[:key]}_numbers"] = match_numbers_to_pairs(category_rows[category[:key]], bg_pairs)
    end
    result
  end

  private

  # Records the rect_y of each category label found in the label column
  # (rect_x within 5.0 of 27.0). Cells with empty text are parsed as nil by
  # CSV, hence the to_s before comparing.
  def locate_category_rows(page_rows)
    page_rows.each do |row|
      next unless (row['rect_x'].to_f - 27.0).abs < 5.0

      label = row['text'].to_s.downcase
      ALL_CATEGORIES.each do |category|
        @category_y_coords[category[:key]] = row['rect_y'].to_f if label == category[:label].downcase
      end
    end
  end

  # Assigns to each B/G pair the first still-unused number whose text_x lies
  # within +threshold+ of the B marker, then likewise for the G marker.
  #
  # @return [Hash] midpoint x => [boys row or nil, girls row or nil]
  def match_numbers_to_pairs(remaining_numbers, bg_pairs, threshold = 10.0)
    numbers = {}
    remaining = remaining_numbers.dup

    bg_pairs.each do |x_mid, bg_pair|
      numbers[x_mid] = bg_pair.first(2).map do |marker|
        marker_x = marker['text_x'].to_f
        found = remaining.find { |row| (row['text_x'].to_f - marker_x).abs < threshold }
        remaining.delete(found) if found
        found
      end
    end

    numbers
  end
end
@@ -0,0 +1,81 @@
1
require 'erb'

# Renders the hash produced by EnrollmentDataReader#read as a standalone HTML
# table: one column pair (B, G) per grade and one row per category or age.
class EnrollmentHtmlWriter
  # Row label => key in the data hash, in display order.
  CATEGORIES = [
    ['Gen', :gen_numbers],
    ['SC', :sc_numbers],
    ['ST', :st_numbers],
    ['OBC', :obc_numbers],
    ['Muslim', :musl_numbers],
    ['Christian', :chris_numbers],
    ['Sikh', :sikh_numbers],
    ['Buddhist', :budd_numbers],
    ['Parsi', :parsi_numbers],
    ['Jain', :jain_numbers],
    ['Others', :others_numbers],
    ['Aadhaar', :aadh_numbers],
    ['BPL', :bpl_numbers],
    ['Repeater', :rept_numbers],
    ['CWSN', :cwsn_numbers]
  ].freeze

  # Writes the table to +html_path+.
  #
  # @param data [Hash, nil] reader output; nothing is written when nil
  # @param html_path [String] destination file
  # @return [Integer, nil] result of File.write, or nil when +data+ is nil
  def self.generate_html(data, html_path)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]

    categories = CATEGORIES.map { |label, key| [label, data[key]] }
    ages = (3..22).map { |age| ["Age #{age}", data[:"age_#{age}_numbers"]] }

    table_rows = (categories + ages).map do |category, numbers|
      cells = bg_pairs.map do |x_mid, _|
        # A category missing from +data+ renders as blank cells instead of
        # raising on nil.
        nums = numbers&.[](x_mid)
        "<td>#{cell_text(nums&.first)}</td><td>#{cell_text(nums&.last)}</td>"
      end.join

      " <tr>\n" \
      " <td class=\"category\">#{category}</td>\n" \
      " #{cells}\n" \
      " </tr>"
    end.join("\n")

    grade_headers = grade_rows.map { |row| "<th colspan='2'>#{cell_text(row)}</th>" }.join
    bg_headers = grade_rows.map { |_| "<td>B</td><td>G</td>" }.join

    html_content = <<~HTML
      <!DOCTYPE html>
      <html>
      <head>
      <title>Enrollment Table</title>
      <style>
      table { border-collapse: collapse; margin-top: 20px; width: 100%; }
      th, td { border: 1px solid black; padding: 8px; text-align: center; }
      .header { font-weight: bold; background-color: #f0f0f0; }
      .grade { font-weight: bold; background-color: #e0e0e0; }
      .bg-pair { background-color: #f8f8f8; }
      .category { font-weight: bold; text-align: left; }
      </style>
      </head>
      <body>
      <h2>Enrolment (By Social Category)</h2>
      <table>
      <tr class="grade">
      <th rowspan="2">Category</th>
      #{grade_headers}
      </tr>
      <tr class="bg-pair">
      #{bg_headers}
      </tr>
      #{table_rows}
      </table>
      </body>
      </html>
    HTML

    File.write(html_path, html_content)
  end

  # HTML-escaped text of a cell row, or '' when the row is absent. The text
  # originates from the PDF, so it is escaped before being embedded in markup.
  def self.cell_text(row)
    row ? ERB::Util.html_escape(row['text']) : ''
  end
  private_class_method :cell_text
end
@@ -0,0 +1,62 @@
1
# Converts the hash produced by EnrollmentDataReader#read into a plain nested
# hash (category or age => grade => { 'boys', 'girls' }) ready for YAML output.
class EnrollmentYamlWriter
  # Output key => key in the data hash, in output order.
  CATEGORY_KEYS = {
    'gen' => :gen_numbers,
    'sc' => :sc_numbers,
    'st' => :st_numbers,
    'obc' => :obc_numbers,
    'muslim' => :musl_numbers,
    'christian' => :chris_numbers,
    'sikh' => :sikh_numbers,
    'buddhist' => :budd_numbers,
    'parsi' => :parsi_numbers,
    'jain' => :jain_numbers,
    'others' => :others_numbers,
    'aadhaar' => :aadh_numbers,
    'bpl' => :bpl_numbers,
    'repeater' => :rept_numbers,
    'cwsn' => :cwsn_numbers
  }.freeze

  # @param data [Hash, nil] reader output
  # @return [Hash, nil] nil when +data+ is nil; otherwise one entry per
  #   category followed by 'age_3' .. 'age_22'
  def self.format_yaml(data)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]

    sections = CATEGORY_KEYS.transform_values { |key| data[key] }
    (3..22).each { |age| sections["age_#{age}"] = data[:"age_#{age}_numbers"] }

    sections.transform_values { |numbers| counts_by_grade(numbers, grade_rows, bg_pairs) }
  end

  # Builds grade name => { 'boys' => n, 'girls' => n } for one category.
  # The i-th B/G pair is labelled with the i-th grade header; pairs without a
  # header are skipped.
  def self.counts_by_grade(numbers, grade_rows, bg_pairs)
    grades = {}
    bg_pairs.each_with_index do |(x_mid, _), index|
      grade_text = grade_rows[index] && grade_rows[index]['text']
      next unless grade_text

      nums = numbers&.[](x_mid)
      grades[grade_text.downcase.gsub(' ', '_')] = {
        'boys' => count_from(nums&.first),
        'girls' => count_from(nums&.last)
      }
    end
    grades
  end
  private_class_method :counts_by_grade

  # Integer value of a cell, or nil when the cell is absent, blank or not a
  # plain number (String#to_i would silently turn a '-' placeholder into 0).
  def self.count_from(row)
    text = row&.[]('text')&.strip
    text&.match?(/\A\d+\z/) ? text.to_i : nil
  end
  private_class_method :count_from
end
@@ -0,0 +1,118 @@
1
# Reads the EWS (Economically Weaker Section) enrollment table from page 1 of
# the extracted-cells CSV.
#
# Cells are grouped by rect_y; the three rows directly below the title row are
# taken as the grade headers, the B/G markers and the values.
class EwsDataReader
  GRADES = [
    'Pre-Pri.', 'Class I', 'Class II', 'Class III', 'Class IV', 'Class V',
    'Class VI', 'Class VII', 'Class VIII', 'Class IX', 'Class X', 'Class XI', 'Class XII'
  ].freeze

  # Text that identifies the title cell of the table.
  TITLE_TEXT = 'Total no. of Economically Weaker Section*(EWS) students Enrolled in Schools'

  def self.read(csv_path) = new(csv_path).read

  # Parses the CSV eagerly. When the title or the three rows below it cannot
  # be found, the row variables stay unset and #read returns nil.
  #
  # @param csv_path [String] path of the extracted-cells CSV
  def initialize(csv_path)
    @csv_path = csv_path
    @rows = Hash.new { |h, k| h[k] = [] }

    CSV.foreach(@csv_path, headers: true) do |cell|
      next unless cell['page'] == '1'

      @rows[cell['rect_y'].to_f] << cell
    end

    @title_row = @rows.find { |_, cells| cells.any? { |cell| cell['text'].to_s.include?(TITLE_TEXT) } }
    title_y = @title_row&.first
    return unless title_y

    # Rows with a smaller rect_y than the title, largest rect_y first.
    rows_after_title = @rows.select { |y, _| y < title_y }.sort_by(&:first).reverse
    return unless rows_after_title.size >= 3

    @grades_row, @bg_row, @values_row = rows_after_title.first(3).map(&:last)
    [@grades_row, @bg_row].each { |row| row.sort_by! { |cell| cell['text_x'].to_f } }
    @values_row = align_values_to_pairs(@values_row, @bg_row)
  end

  # @return [Hash, nil] nil when the table was not found; otherwise a hash
  #   with :grade_rows, :bg_pairs (midpoint x => [B cell, G cell]) and
  #   :ews_numbers (midpoint x => [boys cell, girls cell])
  def read
    return nil unless @grades_row && @bg_row && @values_row

    bg_pairs = {}
    @bg_row.each_slice(2) do |b, g|
      next unless b && g # skip a trailing marker without a partner

      bg_pairs[(b['text_x'].to_f + g['text_x'].to_f) / 2] = [b, g]
    end

    {
      grade_rows: @grades_row,
      bg_pairs: bg_pairs,
      ews_numbers: match_numbers_to_pairs(@values_row, bg_pairs)
    }
  end

  private

  # Returns one value cell per B and G marker, in marker order. A marker with
  # no value within 10.0 of its text_x gets a '-' placeholder, and blank or
  # missing text is normalised to '-'. Incomplete marker pairs are skipped,
  # matching what #read does.
  def align_values_to_pairs(values_row, bg_row)
    bg_row.each_slice(2).flat_map do |b, g|
      next [] unless g

      [b, g].map do |marker|
        marker_x = marker['text_x'].to_f
        value = values_row.find { |cell| (cell['text_x'].to_f - marker_x).abs < 10.0 }
        value ||= { 'text' => '-', 'text_x' => marker_x }
        # CSV parses an empty field as nil, hence the to_s.
        value['text'] = '-' if value['text'].to_s.strip.empty?
        value
      end
    end
  end

  # Assigns to each B/G pair the first still-unused value whose text_x lies
  # within +threshold+ of the B marker, then likewise for the G marker.
  #
  # @return [Hash] midpoint x => [boys cell or nil, girls cell or nil]
  def match_numbers_to_pairs(remaining_numbers, bg_pairs, threshold = 10.0)
    numbers = {}
    remaining = remaining_numbers.dup

    bg_pairs.each do |x_mid, bg_pair|
      next unless bg_pair && bg_pair.size == 2

      numbers[x_mid] = bg_pair.map do |marker|
        marker_x = marker['text_x'].to_f
        found = remaining.find { |cell| (cell['text_x'].to_f - marker_x).abs < threshold }
        remaining.delete(found) if found
        found
      end
    end

    numbers
  end
end
@@ -0,0 +1,63 @@
1
require 'erb'

# Renders the hash produced by EwsDataReader#read as a standalone HTML table
# with one column pair (B, G) per grade and a single 'EWS' row.
class EwsHtmlWriter
  # Writes the table to +html_path+.
  #
  # @param data [Hash, nil] reader output; nothing is written when nil
  # @param html_path [String] destination file
  # @return [Integer, nil] result of File.write, or nil when +data+ is nil
  def self.generate_html(data, html_path)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]

    categories = [
      ['EWS', data[:ews_numbers]]
    ]

    table_rows = categories.map do |category, numbers|
      cells = bg_pairs.map do |x_mid, _|
        # Missing numbers render as blank cells instead of raising on nil.
        nums = numbers&.[](x_mid)
        "<td>#{cell_text(nums&.first)}</td><td>#{cell_text(nums&.last)}</td>"
      end.join

      " <tr>\n" \
      " <td class=\"category\">#{category}</td>\n" \
      " #{cells}\n" \
      " </tr>"
    end.join("\n")

    grade_headers = grade_rows.map { |row| "<th colspan='2'>#{cell_text(row)}</th>" }.join
    bg_headers = grade_rows.map { |_| "<td>B</td><td>G</td>" }.join

    # Title and heading name the EWS table; they previously carried the
    # enrollment writer's "By Social Category" wording.
    html_content = <<~HTML
      <!DOCTYPE html>
      <html>
      <head>
      <title>EWS Enrollment Table</title>
      <style>
      table { border-collapse: collapse; margin-top: 20px; width: 100%; }
      th, td { border: 1px solid black; padding: 8px; text-align: center; }
      .header { font-weight: bold; background-color: #f0f0f0; }
      .grade { font-weight: bold; background-color: #e0e0e0; }
      .bg-pair { background-color: #f8f8f8; }
      .category { font-weight: bold; text-align: left; }
      </style>
      </head>
      <body>
      <h2>Enrolment (Economically Weaker Section)</h2>
      <table>
      <tr class="grade">
      <th rowspan="2">Category</th>
      #{grade_headers}
      </tr>
      <tr class="bg-pair">
      #{bg_headers}
      </tr>
      #{table_rows}
      </table>
      </body>
      </html>
    HTML

    File.write(html_path, html_content)
  end

  # HTML-escaped text of a cell, or '' when the cell is absent. The text
  # originates from the PDF, so it is escaped before being embedded in markup.
  def self.cell_text(cell)
    cell ? ERB::Util.html_escape(cell['text']) : ''
  end
  private_class_method :cell_text
end
@@ -0,0 +1,31 @@
1
# Converts the hash produced by EwsDataReader#read into a plain nested hash
# ('ews' => grade => { 'boys', 'girls' }) ready for YAML output.
class EwsYamlWriter
  # @param data [Hash, nil] reader output
  # @return [Hash, nil] nil when +data+ is nil
  def self.format_yaml(data)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]

    categories = {
      'ews' => data[:ews_numbers]
    }

    categories.transform_values do |numbers|
      grades = {}
      # The i-th B/G pair is labelled with the i-th grade header; pairs
      # without a header are skipped.
      bg_pairs.each_with_index do |(x_mid, _), index|
        grade_text = grade_rows[index] && grade_rows[index]['text']
        next unless grade_text

        nums = numbers&.[](x_mid)
        grades[grade_text.downcase.gsub(' ', '_')] = {
          'boys' => count_from(nums&.first),
          'girls' => count_from(nums&.last)
        }
      end
      grades
    end
  end

  # Integer value of a cell, or nil when the cell is absent, blank or not a
  # plain number. EwsDataReader marks missing values with '-', which
  # String#to_i would silently report as 0.
  def self.count_from(cell)
    text = cell&.[]('text')&.strip
    text&.match?(/\A\d+\z/) ? text.to_i : nil
  end
  private_class_method :count_from
end
@@ -0,0 +1,47 @@
1
+ require_relative 'data_reader_base'
2
+
3
# Declares the location fields of the report. The extraction logic is not in
# this class; it mixes in DataReaderBase, which presumably consumes
# FIELD_MAPPINGS (confirm against data_reader_base.rb).
class LocationDataReader
  include DataReaderBase

  # Field label => options. :key_path is the nested key list for the value;
  # :end_pattern looks like the label that follows the value in the source
  # text (TODO confirm its exact semantics in DataReaderBase). Frozen so the
  # shared table cannot be modified by accident.
  FIELD_MAPPINGS = {
    'State' => { key_path: %w[location state], end_pattern: /District/ },
    'District' => { key_path: %w[location district], end_pattern: /Block/ },
    'Block' => { key_path: %w[location block], end_pattern: /Rural/ },
    'Rural / Urban' => { key_path: %w[location area_type], end_pattern: /Cluster/ },
    'Pincode' => { key_path: %w[location pincode] },
    'Ward' => { key_path: %w[location ward], end_pattern: /Mohalla/ },
    'Cluster' => { key_path: %w[location cluster], end_pattern: /Ward/ },
    'Municipality' => { key_path: %w[location municipality], end_pattern: /Assembly/ },
    'Assembly Const.' => { key_path: %w[location assembly_constituency], end_pattern: /Parl/ },
    'Parl. Constituency' => { key_path: %w[location parliamentary_constituency], end_pattern: /School/ }
  }.freeze
end
@@ -0,0 +1,40 @@
1
+ require_relative 'data_reader_base'
2
+
3
# Declares the official/administrative fields of the report. The extraction
# logic is not in this class; it mixes in DataReaderBase, which presumably
# consumes FIELD_MAPPINGS (confirm against data_reader_base.rb).
class OfficialDataReader
  include DataReaderBase

  # Field label => options. :key_path is the nested key list for the value;
  # :value_type and :end_pattern are interpreted by DataReaderBase (TODO
  # confirm their exact semantics there). Frozen so the shared table cannot be
  # modified by accident.
  FIELD_MAPPINGS = {
    'Year of Establishment' => {
      key_path: %w[official established],
      value_type: :integer
    },
    'Year of Recognition-Pri.' => {
      key_path: %w[official recognition primary],
      value_type: :integer
    },
    'Year of Recognition-Upr.Pri.' => {
      key_path: %w[official recognition upper_primary],
      value_type: :integer
    },
    'Year of Recognition-Sec.' => {
      key_path: %w[official recognition secondary],
      value_type: :integer
    },
    'Year of Recognition-Higher Sec.' => {
      key_path: %w[official recognition higher_secondary],
      value_type: :integer
    },
    'Affiliation Board-Sec' => {
      key_path: %w[official affiliation secondary],
      end_pattern: /Affiliation Board-HSec/
    },
    'Affiliation Board-HSec' => {
      key_path: %w[official affiliation higher_secondary],
      end_pattern: /Is this/
    },
    'School Management' => {
      key_path: %w[official management],
      end_pattern: /School Type/
    }
  }.freeze
end
@@ -0,0 +1,49 @@
1
# Extracts positioned text blocks (BT ... ET) from the raw content streams of
# a PDF reader object.
class PDFBlockExtractor
  # A literal string operand, e.g. "(Some text)", with backslash escapes.
  STRING_LITERAL = /\((?:\\.|[^\\()])*\)/
  # BT / ET as stand-alone operators, not as part of a longer word or name.
  BEGIN_TEXT = %r{(?<![\w/])BT(?!\w)}
  END_TEXT = %r{(?<![\w/])ET(?!\w)}
  TEXT_MATRIX = /1\s+0\s+0\s+1\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+Tm/
  FONT = /\/F(\d+)\s+(\d+(\.\d+)?)\s+Tf/
  SHOW_TEXT = /\((.*?)\)\s*Tj/

  # @param reader [#pages] object whose pages respond to #raw_content
  # @return [Array<Hash>] one hash per non-empty, positioned text block with
  #   :page, :start_line, :text, :x, :y, :end_line and, when a font operator
  #   was seen, :font and :font_size
  def self.extract_blocks(reader)
    blocks = []

    reader.pages.each_with_index do |page, index|
      current_block = nil

      page.raw_content.each_line do |line|
        # Operators are looked up outside string literals so that text such as
        # "(SUBTOTAL)" or "[(TOILET)] TJ" is not mistaken for BT or ET.
        operators = line.gsub(STRING_LITERAL, '')

        if operators.match?(BEGIN_TEXT)
          current_block = { page: index + 1, start_line: line.strip, text: [] }
        elsif current_block.nil?
          # Outside a text object there is nothing to collect.
          next
        elsif (position = line.match(TEXT_MATRIX))
          # Only the first position inside a block is kept.
          unless current_block[:x] && current_block[:y]
            current_block[:x] = position[1].to_f
            current_block[:y] = position[2].to_f
          end
        elsif (font = line.match(FONT))
          # Only the first font inside a block is kept.
          unless current_block[:font] && current_block[:font_size]
            current_block[:font] = "F#{font[1]}"
            current_block[:font_size] = font[2].to_f
          end
        elsif (shown = line.match(SHOW_TEXT))
          # Backslashes are dropped so escaped parentheses read as plain text.
          current_block[:text] << shown[1].gsub(/\\/, '')
        elsif operators.match?(END_TEXT)
          current_block[:end_line] = line.strip
          current_block[:text] = current_block[:text].join(' ')
          if !current_block[:text].empty? && current_block[:x] && current_block[:y]
            blocks << current_block
          end
          # Close the block so stray operators cannot alter what was emitted.
          current_block = nil
        end
      end
    end

    blocks
  end
end
@@ -0,0 +1,36 @@
1
# Reduces a raw PDF content stream to the text shown inside its BT ... ET
# blocks, one piece of text per output line.
class PDFContentCompressor
  # A literal string operand, e.g. "(Some text)", with backslash escapes.
  STRING_LITERAL = /\((?:\\.|[^\\()])*\)/
  # BT / ET as stand-alone operators, not as part of a longer word or name.
  BEGIN_TEXT = %r{(?<![\w/])BT(?!\w)}
  END_TEXT = %r{(?<![\w/])ET(?!\w)}
  SHOW_TEXT = /\((.*?)\)\s*Tj/
  # Pieces of words that are split across several Tj operators and have to be
  # glued back together.
  SPLIT_FRAGMENT = /^(?:Non|Residenti|al|Digit|al Facil|ities)$/

  # @param content [String] raw content stream
  # @return [String] extracted texts joined with "\n"
  def self.compress(content)
    compressed = []
    current_block = []
    in_bt_block = false
    current_text = ''

    content.each_line do |line|
      # Operators are looked up outside string literals so that text such as
      # "(INTERNET)" or "(SUBTOTAL)" is not mistaken for ET or BT.
      operators = line.gsub(STRING_LITERAL, '')

      if operators.match?(BEGIN_TEXT)
        in_bt_block = true
        current_block = []
        current_text = ''
      elsif operators.match?(END_TEXT)
        in_bt_block = false
        current_text = current_block.join
        compressed << current_text unless current_text.empty?
        # Reset so a stray ET cannot emit the same text a second time.
        current_block = []
        current_text = ''
      elsif in_bt_block && (shown = line.match(SHOW_TEXT))
        text = shown[1].strip
        if text.match?(SPLIT_FRAGMENT)
          # Known fragment: append it to the text being assembled.
          current_text += text
          current_block << text
        else
          # Regular text: flush what was pending and start over with it.
          compressed << current_text unless current_text.empty?
          current_text = text
          current_block = [text]
        end
      end
    end

    compressed.reject(&:empty?).join("\n")
  end
end
+ end